From eea7f5a4ad9b035e4baee57f9818fe87aa25a099 Mon Sep 17 00:00:00 2001 From: Alex Sharov Date: Sat, 19 Jun 2021 13:31:04 +0700 Subject: [PATCH] use as go module, win support (#21) --- .github/workflows/test.yml | 28 +- .gitignore | 4 +- .golangci.yml | 4 - Makefile | 14 +- README.md | 170 +- go.mod | 2 +- internal/mdbxcmd/cmutil.go | 37 - mdbx/.gitignore | 1 + mdbx/ChangeLog.md | 446 + mdbx/MDBX_LICENSE | 47 + mdbx/cursor.go | 2 +- mdbx/cursor_test.go | 18 +- mdbx/dist/CMakeLists.txt | 58 +- mdbx/dist/ChangeLog.md | 54 +- mdbx/dist/GNUmakefile | 31 +- mdbx/dist/README.md | 29 +- mdbx/dist/VERSION | 2 +- mdbx/dist/man1/mdbx_stat.1 | 13 +- mdbx/dist/mdbx.c | 772 +- mdbx/dist/mdbx.c++ | 91 +- mdbx/dist/mdbx.h | 12 +- mdbx/dist/mdbx_chk.c | 191 +- mdbx/dist/mdbx_copy.c | 85 +- mdbx/dist/mdbx_drop.c | 109 +- mdbx/dist/mdbx_dump.c | 113 +- mdbx/dist/mdbx_load.c | 87 +- mdbx/dist/mdbx_stat.c | 185 +- mdbx/env.go | 108 +- mdbx/env_test.go | 144 +- mdbx/error_unix.go | 1 - mdbx/error_windows.go | 1 - mdbx/internal/arch/width.go | 7 - mdbx/mdbx.c | 29920 +++++++++++++++++++++++++ mdbx/mdbx.go | 14 +- mdbx/mdbx.h | 5099 +++++ mdbx/mdbx_test.go | 12 +- {internal => mdbx}/mdbxarch/width.go | 2 +- mdbx/mdbxgo.c | 2 +- mdbx/mdbxgo.h | 2 +- mdbx/txn.go | 35 +- mdbx/txn_test.go | 286 +- mdbx/val.go | 5 +- 42 files changed, 36767 insertions(+), 1476 deletions(-) delete mode 100644 internal/mdbxcmd/cmutil.go create mode 100644 mdbx/ChangeLog.md create mode 100644 mdbx/MDBX_LICENSE delete mode 100644 mdbx/internal/arch/width.go create mode 100644 mdbx/mdbx.c create mode 100644 mdbx/mdbx.h rename {internal => mdbx}/mdbxarch/width.go (97%) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 688b48c..793a678 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -6,18 +6,12 @@ jobs: strategy: matrix: os: [ ubuntu-16.04, ubuntu-18.04, ubuntu-20.04, macos-10.15 ] # list of os: https://github.com/actions/virtual-environments - # os: [ubuntu-20.04, macos-10.15, windows-latest] # list of os: https://github.com/actions/virtual-environments - go: [ '1.16', '1.13' ] + go: [ '1.16', '1.15' ] runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v2 - - name: Add Msys64 to PATH # see https://github.com/actions/virtual-environments/issues/1613 and https://github.com/actions/virtual-environments/pull/1648 - if: matrix.os == 'windows-latest' - run: echo "::add-path::/c/msys64/mingw64/bin:/c/msys64/usr/bin" - shell: bash - - uses: actions/setup-go@v2 with: go-version: ${{ matrix.go }} @@ -27,8 +21,24 @@ jobs: path: ~/go/pkg/mod key: go-${{ matrix.os }}-${{ matrix.go }}-${{ hashFiles('**/go.sum') }} - - name: Test - run: cc --version && make test + - run: go test -v ./... - name: Race run: make race + + win: + strategy: + matrix: + os: [ windows-2019 ] # list of os: https://github.com/actions/virtual-environments + go: [ '1.16' ] + runs-on: ${{ matrix.os }} + + steps: + - uses: actions/checkout@v2 + + - uses: actions/setup-go@v2 + with: + go-version: ${{ matrix.go }} + + - run: choco upgrade mingw cmake -y --no-progress + - run: go test -v ./... 
diff --git a/.gitignore b/.gitignore index 7e0c2e0..2de7310 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,4 @@ -# c repository for updating after each release -/lmdb/lmdb -/lmdb/openldap +.idea /bin/ /build diff --git a/.golangci.yml b/.golangci.yml index af6ec99..74d158a 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -13,8 +13,6 @@ linters: - errcheck - goconst - gofmt - - golint - - interfacer - govet - structcheck - stylecheck @@ -22,7 +20,6 @@ # - goerr113 - unconvert - unparam - - scopelint - nakedret - prealloc - gosimple @@ -31,7 +28,6 @@ - depguard - typecheck - misspell - - maligned linters-settings: gofmt: diff --git a/Makefile b/Makefile index fbe8b0d..07fea2c 100644 --- a/Makefile +++ b/Makefile @@ -9,20 +9,20 @@ GOLDFLAGS="-X main.branch $(BRANCH) -X main.commit $(COMMIT)" deps: lintci-deps go get -d ./...

-all: deps check mdbx-build +all: deps check

-test: mdbx-build +test: go test ./mdbx ./exp/mdbxpool

-race: mdbx-build +race: go test -race ./mdbx ./exp/mdbxpool

-lint: mdbx-build +lint: ./build/bin/golangci-lint run --new-from-rev=$(MASTER_COMMIT) ./...

lintci-deps: rm -f ./build/bin/golangci-lint - curl -sfL https://install.goreleaser.com/github.com/golangci/golangci-lint.sh | sh -s -- -b ./build/bin v1.31.0 + curl -sfL https://install.goreleaser.com/github.com/golangci/golangci-lint.sh | sh -s -- -b ./build/bin v1.41.0

check: which goimports > /dev/null @@ -32,7 +32,3 @@ check:

clean: cd mdbx/dist/ && make clean - -mdbx-build: - echo "Building mdbx" - cd mdbx/dist/ && make clean && make config.h && CFLAGS_EXTRA="-Wno-deprecated-declarations" make mdbx-static.o diff --git a/README.md b/README.md index f1299b6..e5f631d 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,23 @@ -# mdbx-go +# mdbx-go

-Go bindings to the libmdbx: https://github.com/erthink/libmdbx +Go bindings for libmdbx: https://erthink.github.io/libmdbx/
+
+Most articles on the internet about LMDB also apply to MDBX, but MDBX has more features.
+
+For a deeper understanding of the DB, please read through [mdbx.h](https://github.com/erthink/libmdbx/blob/master/mdbx.h).
+
+## Minimum Requirements
+
+A C compiler compatible with GCC or CLANG (MinGW 10 on Windows)
+Go 1.15 or newer

## Packages

-Functionality is logically divided into several packages. Applications will -usually need to import **mdbx** but may import other packages on an as needed -basis. +Functionality is logically divided into several packages. Applications will usually need to import **mdbx** but may
+import other packages on an as-needed basis.

-Packages in the `exp/` directory are not stable and may change without warning. -That said, they are generally usable if application dependencies are managed -and pinned by tag/commit. +Packages in the `exp/` directory are not stable and may change without warning. That said, they are generally usable if
+application dependencies are managed and pinned by tag/commit.

Developers concerned with package stability should consult the documentation.

@@ -24,59 +31,34 @@ Core bindings allowing low-level access to MDBX.

#### exp/mdbxpool

[![GoDoc](https://godoc.org/github.com/torquem-ch/mdbx-go/mdbx/exp/mdbxpool?status.svg)](https://godoc.org/github.com/torquem-ch/mdbx-go/mdbx/exp/mdbxpool) [![experimental](https://img.shields.io/badge/stability-experimental-red.svg)](#user-content-versioning-and-stability) - ```go import "github.com/torquem-ch/mdbx-go/exp/mdbxpool" ```

-A utility package which facilitates reuse of mdbx.Txn objects using a -sync.Pool. Naively storing mdbx.Txn objects in sync.Pool can be troublesome. -And the mdbxpool.TxnPool type has been defined as a complete pooling solution -and as reference for applications attempting to write their own pooling -implementation. - -The mdbxpool package is relatively new. But it has a lot of potential utility. -And once the mdbxpool API has been ironed out, and the implementation hardened -through use by real applications it can be integrated directly into the mdbx -package for more transparent integration. Please test this package and provide -feedback to speed this process up. +A utility package which facilitates reuse of mdbx.Txn objects using a sync.Pool. Naively storing mdbx.Txn objects in
+sync.Pool can be troublesome, so the mdbxpool.TxnPool type has been defined as a complete pooling solution and as a
+reference for applications attempting to write their own pooling implementation.
+The mdbxpool package is relatively new but has a lot of potential utility. Once the mdbxpool API has been ironed
+out and the implementation hardened through use by real applications, it can be integrated directly into the mdbx
+package for more transparent integration. Please test this package and provide feedback to speed this process up.

## Key Features

### Idiomatic API

-API inspired by [BoltDB](https://github.com/boltdb/bolt) with automatic -commit/rollback of transactions. The goal of mdbx-go is to provide idiomatic -database interactions without compromising the flexibility of the C API. - -**NOTE:** While the mdbx package tries hard to make MDBX as easy to use as -possible there are compromises, gotchas, and caveats that application -developers must be aware of when relying on MDBX to store their data. All -users are encouraged to fully read the -[documentation](https://godoc.org/github.com/torquem-ch/mdbx-go/mdbx) so they are -aware of these caveats. - -### API coverage - -The mdbx-go project aims for complete coverage of the MDBX C API (within -reason). Some notable features and optimizations that are supported: - -- Idiomatic subtransactions ("sub-updates") that allow the batching of updates. - -- Batch IO on databases utilizing the `MDB_DUPSORT` and `MDB_DUPFIXED` flags. +API inspired by [BoltDB](https://github.com/boltdb/bolt) with automatic commit/rollback of transactions. The goal of
+mdbx-go is to provide idiomatic database interactions without compromising the flexibility of the C API.

-- Reserved writes than can save in memory copies converting/buffering into - `[]byte`. +**NOTE:** While the mdbx package tries hard to make MDBX as easy to use as possible there are compromises, gotchas, and
+caveats that application developers must be aware of when relying on MDBX to store their data. All users are encouraged
+to fully read the [documentation](https://erthink.github.io/libmdbx/) so they are aware of these caveats. Even
+better, read through [mdbx.h](https://github.com/erthink/libmdbx/blob/master/mdbx.h).

-For tracking purposes a list of unsupported features is kept in an -[issue](https://github.com/torquem-ch/mdbx-go/issues/1). +### High-performance notes

-### Zero-copy reads - -Applications with high performance requirements can opt-in to fast, zero-copy -reads at the cost of runtime safety. +Applications with high performance requirements can opt in to fast, zero-copy reads at the cost of runtime safety.
+Zero-copy behavior is specified at the transaction level to reduce instrumentation overhead.

``` err := mdbx.View(func(txn *mdbx.Txn) error { @@ -89,11 +71,7 @@ err := mdbx.View(func(txn *mdbx.Txn) error { }) ```

-## MDBX compared to BoltDB - -BoltDB is a quality database with a design similar to MDBX. Both store -key-value data in a file and provide ACID transactions. So there are often -questions of why to use one database or the other. +Use `NoReadahead` if your data is larger than RAM.

### Advantages of BoltDB

@@ -101,79 +79,61 @@ questions of why to use one database or the other.

- Far more databases can be accessed concurrently.

-- Operating systems that do not support sparse files do not use up excessive - space due to a large pre-allocation of file space. +- No `Bucket` object, which means fewer allocations and higher performance
+
+- Operating systems that do not support sparse files do not use up excessive space due to a large pre-allocation of file
+  space.

- As a pure Go package bolt can be easily cross-compiled using the `go` toolchain and `GOOS`/`GOARCH` variables.

-- Its simpler design and implementation in pure Go mean it is free of many - caveats and gotchas which are present using the MDBX package. For more - information about caveats with the MDBX package, consult its - [documentation](https://godoc.org/github.com/torquem-ch/mdbx-go/mdbx). +- Its simpler design and implementation in pure Go mean it is free of many caveats and gotchas which are present using
+  the MDBX package. For more information about caveats with the MDBX package, consult its
+  [documentation](https://erthink.github.io/libmdbx/). Even better, read
+  through [mdbx.h](https://github.com/erthink/libmdbx/blob/master/mdbx.h).

-### Advantages of LMDB and MDBX +### Advantages of MDBX

- Keys can contain multiple values using the DupSort flag.

- Updates can have sub-updates for atomic batching of changes.

-- Databases typically remain open for the application lifetime. This limits - the number of concurrently accessible databases. But, this minimizes the - overhead of database accesses and typically produces cleaner code than - an equivalent BoltDB implementation. - -- Significantly faster than BoltDB. The raw speed of MDBX easily surpasses - BoltDB. Additionally, MDBX provides optimizations ranging from safe, - feature-specific optimizations to generally unsafe, extremely situational - ones. Applications are free to enable any optimizations that fit their data, - access, and reliability models. - -- MDBX allows multiple applications to access a database simultaneously. - Updates from concurrent processes are synchronized using a database lock - file. - -- As a C library, applications in any language can interact with MDBX - databases. Mission critical Go applications can use a database while Python - scripts perform analysis on the side. - -## Build +- Databases typically remain open for the application lifetime. This limits the number of concurrently accessible
+  databases. But this minimizes the overhead of database accesses and typically produces cleaner code than an
+  equivalent BoltDB implementation.

-There is no dependency on shared libraries. But it's impossible to use 'go get' for now. Only way is to copy sources of this package to your project, and call `make mdbx-build` manually. See: https://github.com/torquem-ch/mdbx-go/issues/5 +- Significantly faster than BoltDB. The raw speed of MDBX easily surpasses BoltDB. Additionally, MDBX provides
+  optimizations ranging from safe, feature-specific optimizations to generally unsafe, extremely situational ones.
+  Applications are free to enable any optimizations that fit their data, access, and reliability models.

-On FreeBSD 10, you must explicitly set `CC` (otherwise it will fail with a -cryptic error), for example: +- MDBX allows multiple applications to access a database simultaneously. Updates from concurrent processes are
+  synchronized using a database lock file.

-    CC=clang go test -v ./... +- As a C library, applications in any language can interact with MDBX databases. Mission-critical Go applications can
+  use a database while Python scripts perform analysis on the side.

-## Documentation - -### Go doc - -The `go doc` documentation available on -[godoc.org](https://godoc.org/github.com/torquem-ch/mdbx-go) is the primary source -of developer documentation for mdbx-go. It provides an overview of the API -with a lot of usage examples. Where necessary the documentation points out -differences between the semantics of methods and their C counterparts. +## Build

-### LMDB +On FreeBSD 10, you must explicitly set `CC` (otherwise it will fail with a cryptic error), for example:

-The LMDB [homepage](http://symas.com/mdb/) +    CC=clang go test -v ./...

-### MDBX +## Documentation

-The MDBX [homepage](https://github.com/erthink/libmdbx) +- For examples, see the *_test.go files in this repo
+- [The MDBX homepage](https://erthink.github.io/libmdbx/); even better, read
+  through [mdbx.h](https://github.com/erthink/libmdbx/blob/master/mdbx.h).
+- [godoc.org](https://godoc.org/github.com/torquem-ch/mdbx-go)
+- [The LMDB homepage](http://symas.com/mdb/)

### Versioning and Stability

-The mdbx-go project makes regular releases with IDs `X.Y.Z`. All packages -outside of the `exp/` directory are considered stable and adhere to the -guidelines of [semantic versioning](http://semver.org/). +The mdbx-go project makes regular releases with IDs `X.Y.Z`. All packages outside of the `exp/` directory are considered
+stable and adhere to the guidelines of [semantic versioning](http://semver.org/).

-Experimental packages (those packages in `exp/`) are not required to adhere to -semantic versioning. However packages specifically declared to merely be +Experimental packages (those packages in `exp/`) are not required to adhere to semantic versioning. However, packages
+specifically declared to merely be
"unstable" can be relied on more for long term use with less concern.

-The API of an unstable package may change in subtle ways between minor release -versions. But deprecations will be indicated at least one release in advance -and all functionality will remain available through some method. +The API of an unstable package may change in subtle ways between minor release versions. But deprecations will be
+indicated at least one release in advance and all functionality will remain available through some method.
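Since this patch turns the repository into a regular Go module, a hedged end-to-end sketch of the idiomatic API described in the README hunk above may help. Only `env.Update`, `env.View` and the four-argument `OpenDBI` form are confirmed by the tests later in this diff; `NewEnv`, `SetMaxDBs`, `Open`, `Put` and `Get` follow the lmdb-go naming this package inherits and are assumptions to verify against `mdbx/env.go` and `mdbx/txn.go`.

```go
// A minimal write-then-read round trip under the assumptions noted above.
package main

import (
	"fmt"
	"log"

	"github.com/torquem-ch/mdbx-go/mdbx"
)

func main() {
	env, err := mdbx.NewEnv()
	if err != nil {
		log.Fatal(err)
	}
	defer env.Close()
	if err := env.SetMaxDBs(1); err != nil { // assumed helper, as in lmdb-go
		log.Fatal(err)
	}
	// Path handling is assumed to follow lmdb-go: an existing directory.
	if err := env.Open("/tmp/mdbx-example", 0, 0644); err != nil {
		log.Fatal(err)
	}

	var dbi mdbx.DBI
	// Update runs the closure in a read-write transaction and commits on nil error.
	err = env.Update(func(txn *mdbx.Txn) error {
		dbi, err = txn.OpenDBI("example", mdbx.Create, nil, nil)
		if err != nil {
			return err
		}
		return txn.Put(dbi, []byte("k"), []byte("v"), 0)
	})
	if err != nil {
		log.Fatal(err)
	}

	// View runs the closure in a read-only transaction.
	err = env.View(func(txn *mdbx.Txn) error {
		v, err := txn.Get(dbi, []byte("k"))
		if err != nil {
			return err
		}
		fmt.Printf("%s\n", v)
		return nil
	})
	if err != nil {
		log.Fatal(err)
	}
}
```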
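The "High-performance notes" section above mentions the zero-copy opt-in but the README snippet elides the body. Below is a minimal sketch assuming `txn.RawRead`, the lmdb-go mechanism this package inherits; treat the field name as an assumption and verify it against `mdbx/txn.go`.

```go
// Values read under RawRead alias the memory map and are only valid
// inside the transaction, so they must be copied before it ends.
package example

import "github.com/torquem-ch/mdbx-go/mdbx"

func ReadRaw(env *mdbx.Env, dbi mdbx.DBI, key []byte) ([]byte, error) {
	var out []byte
	err := env.View(func(txn *mdbx.Txn) error {
		txn.RawRead = true // opt in to zero-copy reads for this transaction only
		v, err := txn.Get(dbi, key)
		if err != nil {
			return err
		}
		out = make([]byte, len(v))
		copy(out, v) // copy before the mmap-backed slice becomes invalid
		return nil
	})
	return out, err
}
```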
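The exp/mdbxpool paragraph above describes TxnPool without showing it in use. The following sketch assumes `NewTxnPool`, `BeginTxn` and `Abort` carried over from the lmdb-go "lmdbpool" package this one derives from; none of these names are confirmed by this patch, so check `exp/mdbxpool` before relying on them.

```go
// Hypothetical pooling sketch under the naming assumptions noted above.
package example

import (
	"github.com/torquem-ch/mdbx-go/exp/mdbxpool"
	"github.com/torquem-ch/mdbx-go/mdbx"
)

func ReadWithPool(pool *mdbxpool.TxnPool, dbi mdbx.DBI, key []byte) ([]byte, error) {
	txn, err := pool.BeginTxn(0) // may renew a pooled read-only txn instead of allocating one
	if err != nil {
		return nil, err
	}
	defer pool.Abort(txn) // returns the txn to the sync.Pool rather than destroying it

	v, err := txn.Get(dbi, key)
	if err != nil {
		return nil, err
	}
	out := make([]byte, len(v))
	copy(out, v) // the value must not outlive the pooled txn
	return out, nil
}
```

The pool itself would be constructed once per process, e.g. `mdbxpool.NewTxnPool(env)`, under the same naming assumption.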
diff --git a/go.mod b/go.mod index 55e08d2..736dec2 100644 --- a/go.mod +++ b/go.mod @@ -1,3 +1,3 @@ module github.com/torquem-ch/mdbx-go -go 1.13 +go 1.15 diff --git a/internal/mdbxcmd/cmutil.go b/internal/mdbxcmd/cmutil.go deleted file mode 100644 index 1593032..0000000 --- a/internal/mdbxcmd/cmutil.go +++ /dev/null @@ -1,37 +0,0 @@ -package mdbxcmd - -import ( - "flag" -) - -var flagPrintVersion bool -var flagOpenNoSubDir bool - -func init() { - flag.BoolVar(&flagPrintVersion, "V", false, "Write the library version number to the standard output, and exit.") - flag.BoolVar(&flagOpenNoSubDir, "n", false, "Open LDMB environment(s) which do not use subdirectories.") -} - -//func printVersion(w io.Writer) { -// fmt.Fprintln(w, mdbx.VersionString()) -//} -// -//// PrintVersion writes the API version in a human readable format to -//// os.Stdout. -//func PrintVersion() { -// if flagPrintVersion { -// printVersion(os.Stdout) -// os.Exit(0) -// } -//} - -// OpenFlag returns the bitwise OR'd set of flags specified by options defined -// in the package. The returned value may be OR'd with additional flags if -// needed. -//func OpenFlag() uint { -// var flag uint -// if flagOpenNoSubDir { -// flag |= mdbx.NoSubdir -// } -// return flag -//} diff --git a/mdbx/.gitignore b/mdbx/.gitignore index 416ed56..18e8641 100644 --- a/mdbx/.gitignore +++ b/mdbx/.gitignore @@ -2,6 +2,7 @@ *.dylib *.o config.h +config2.h config_darwin.h config_linux.h config_win.h diff --git a/mdbx/ChangeLog.md b/mdbx/ChangeLog.md new file mode 100644 index 0000000..76a1416 --- /dev/null +++ b/mdbx/ChangeLog.md @@ -0,0 +1,446 @@ +ChangeLog +--------- + +## v0.10.2 (in development) + +### TODO + + - [Move most of `mdbx_chk` functional to the library API](https://github.com/erthink/libmdbx/issues/204). + - [Replace SRW-lock on Windows to allow shrink DB with `MDBX_NOTLS` option](https://github.com/erthink/libmdbx/issues/210). + - [More flexible support of asynchronous runtime/framework(s)](https://github.com/erthink/libmdbx/issues/200). + - [Migration guide from LMDB to MDBX](https://github.com/erthink/libmdbx/issues/199). + - [Get rid of dirty-pages list in MDBX_WRITEMAP mode](https://github.com/erthink/libmdbx/issues/193). + - [Large/Overflow pages accounting for dirty-room](https://github.com/erthink/libmdbx/issues/192). + - [C++ Buffer issue](https://github.com/erthink/libmdbx/issues/191). + - [Support for RAW devices](https://github.com/erthink/libmdbx/issues/124). + - [Test framework issue](https://github.com/erthink/libmdbx/issues/127). + - [Support MessagePack for Keys & Values](https://github.com/erthink/libmdbx/issues/115). + - [Engage new terminology](https://github.com/erthink/libmdbx/issues/137). + - Finalize C++ API (few typos and trivia bugs are still likely for now). + - Packages for [Astra Linux](https://astralinux.ru/), [ALT Linux](https://www.altlinux.org/), [ROSA Linux](https://www.rosalinux.ru/), etc. + +Acknowledgements: + + - [Alex Sharov](https://github.com/AskAlexSharov) for reporting and testing. + - [Andrea Lanfranchi](https://github.com/AndreaLanfranchi) for reporting bugs. + +New features: + + - [Allow to predefine/override `MDBX_BUILD_TIMESTAMP` for builds reproducibility](https://github.com/erthink/libmdbx/issues/201). + +Fixes: + + - Fixed excess meta-pages checks in case `mdbx_chk` is called to check the DB for a specific meta page and thus could prevent switching to the selected meta page, even if the check passed without errors. 
+ - Fixed [recursive use of SRW-lock on Windows caused by `MDBX_NOTLS` option](https://github.com/erthink/libmdbx/issues/203).
+ - Fixed [logging a warning during a new DB creation](https://github.com/erthink/libmdbx/issues/205).
+ - Fixed [false-negative `mdbx_cursor_eof()` result](https://github.com/erthink/libmdbx/issues/207).
+ - Fixed [`make install` with non-GNU `install` utility (OSX, BSD)](https://github.com/erthink/libmdbx/issues/208).
+ - Fixed [installation by `CMake` in special cases by completely using `GNUInstallDirs`'s variables](https://github.com/erthink/libmdbx/issues/209).
+
+
+## v0.10.1 at 2021-06-01
+
+Acknowledgements:
+
+ - [Alexey Akhunov](https://github.com/AlexeyAkhunov) and [Alex Sharov](https://github.com/AskAlexSharov) for bug reporting and testing.
+ - [Andrea Lanfranchi](https://github.com/AndreaLanfranchi) for bug reporting and testing related to WSL2.
+
+New features:
+
+ - Added `-p` option to `mdbx_stat` utility for printing page operations statistics.
+ - Added explicit checking for, and a warning about, using unfit GitHub archives.
+ - Added fallback from [OFD locking](https://bit.ly/3yFRtYC) to legacy non-OFD POSIX file locks on an `EINVAL` error.
+ - Added [Plan 9](https://en.wikipedia.org/wiki/9P_(protocol)) network file system to the whitelist for the ability to open a DB in exclusive mode.
+ - Support for opening, from a WSL2 environment, a DB hosted on a Windows drive and mounted via [DrvFs](https://docs.microsoft.com/it-it/archive/blogs/wsl/wsl-file-system-support#drvfs) (i.e. by Plan 9 as noted above).
+
+Fixes:
+
+ - Fixed minor "foo not used" warnings from modern C++ compilers when building the C++ part of the library.
+ - Fixed confusing/messy errors when building the library from unfit GitHub archives (https://github.com/erthink/libmdbx/issues/197).
+ - Fixed `#elsif` typo.
+ - Fixed rare unexpected `MDBX_PROBLEM` error during altering data in huge transactions due to wrong spilling/ousting of dirty pages (https://github.com/erthink/libmdbx/issues/195).
+ - Re-fixed WSL1/WSL2 detection, now distinguishing between them (https://github.com/erthink/libmdbx/issues/97).
+
+
+## v0.10.0 at 2021-05-09
+
+Acknowledgements:
+
+ - [Mahlon E. Smith](https://github.com/mahlonsmith) for [Ruby bindings](https://rubygems.org/gems/mdbx/).
+ - [Alex Sharov](https://github.com/AskAlexSharov) for [mdbx-go](https://github.com/torquem-ch/mdbx-go), bug reporting and testing.
+ - [Artem Vorotnikov](https://github.com/vorot93) for bug reporting and PR.
+ - [Paolo Rebuffo](https://www.linkedin.com/in/paolo-rebuffo-8255766/), [Alexey Akhunov](https://github.com/AlexeyAkhunov) and Mark Grosberg for donations.
+ - [Noel Kuntze](https://github.com/Thermi) for preliminary [Python bindings](https://github.com/Thermi/libmdbx/tree/python-bindings)
+
+New features:
+
+ - Added `mdbx_env_set_option()` and `mdbx_env_get_option()` for controlling
+   various runtime options for an environment (the announcement of this feature was missed in previous news).
+ - Added `MDBX_DISABLE_PAGECHECKS` build option to disable some checks, reducing overhead
+   and bringing the database-corruption detection probability to values closer to LMDB's.
+   `MDBX_DISABLE_PAGECHECKS=1` provides a performance boost of about 10% in CRUD scenarios,
+   and conjointly with the `MDBX_ENV_CHECKPID=0` and `MDBX_TXN_CHECKOWNER=0` options can yield
+   up to 30% more performance compared to LMDB.
+ - Using a floating-point (exponentially quantized) representation for internal 16-bit values
+   of the grow step and shrink threshold when they are huge (https://github.com/erthink/libmdbx/issues/166).
+   To minimize the impact on compatibility, only the odd values inside the upper half
+   of the range (i.e. 32769..65533) are used for the new representation.
+ - Added the `mdbx_drop` command-line tool, similar to LMDB's, to purge or delete (sub)database(s).
+ - [Ruby bindings](https://rubygems.org/gems/mdbx/) are available now by [Mahlon E. Smith](https://github.com/mahlonsmith).
+ - Added `MDBX_ENABLE_MADVISE` build option which controls the use of POSIX `madvise()` hints and friends.
+ - The internal node sizes were refined, resulting in a reduction of large/overflow pages in some use cases
+   and a slight increase in the key size limit to ≈½ of the page size.
+ - Added the number of keys/items on pages to `mdbx_chk` output.
+ - Added explicit `install-strip` and `install-no-strip` targets to the `Makefile` (https://github.com/erthink/libmdbx/pull/180).
+ - Major rework of page splitting (af9b7b560505684249b76730997f9e00614b8113) for
+   - an "auto-appending" feature upon insertion for both ascending and
+     descending key sequences. As a result, the optimality of page filling
+     increases significantly (denser, with less slackness) while
+     inserting ordered sequences of keys,
+   - "splitting at middle" to make the page tree more balanced on average.
+ - Added `mdbx_get_sysraminfo()` to the API.
+ - Added guessing of a reasonable maximum DB size for the default upper limit of geometry (https://github.com/erthink/libmdbx/issues/183).
+ - Major rework of the internal labeling of dirty pages (958fd5b9479f52f2124ab7e83c6b18b04b0e7dda) for
+   a "transparent spilling" feature, the gist of which is to make dirty pages
+   ready for spilling (writing to disk) without further altering them.
+   Thus in `MDBX_WRITEMAP` mode the OS kernel is able to oust dirty pages
+   to the DB file without further penalty during transaction commit.
+   As a result, page swapping and I/O could be significantly reduced during extra-large transactions and/or lack of memory.
+ - Minimized reading of leaf-pages while dropping subDB(s) and nested trees.
+ - Major rework of dirty-page spilling to support an [LRU](https://en.wikipedia.org/wiki/Cache_replacement_policies#Least_recently_used_(LRU))
+   policy and prioritization for large/overflow pages.
+ - Statistics of page operations (split, merge, copy, spill, etc.) are now available through `mdbx_env_info_ex()`.
+ - Auto-setup of the limit for the dirty-pages list length (`MDBX_opt_txn_dp_limit` option).
+ - Support `make options` to list available build options.
+ - Support `make help` to list available make targets.
+ - Silent `make` build by default.
+ - Preliminary [Python bindings](https://github.com/Thermi/libmdbx/tree/python-bindings) are available now
+   by [Noel Kuntze](https://github.com/Thermi) (https://github.com/erthink/libmdbx/issues/147).
+
+Backward compatibility break:
+
+ - The `MDBX_AVOID_CRT` build option was renamed to `MDBX_WITHOUT_MSVC_CRT`.
+   This option is only relevant when building for Windows.
+ - `mdbx_env_stat()` always, and `mdbx_env_stat_ex()` when called with a zeroed transaction parameter,
+   now internally start a temporary read transaction and thus may return an `MDBX_BAD_RSLOT` error.
+   So just never use the deprecated `mdbx_env_stat()`; call `mdbx_env_stat_ex()` with a transaction parameter instead.
+ - The build option `MDBX_CONFIG_MANUAL_TLS_CALLBACK` was removed; now just a non-zero value of
+   the `MDBX_MANUAL_MODULE_HANDLER` macro indicates the requirement to manually call `mdbx_module_handler()`
+   when loading libraries and applications that use statically linked libmdbx on obsolete Windows versions.
+
+Fixes:
+
+ - Fixed performance regression due to non-optimal C11 atomics usage (https://github.com/erthink/libmdbx/issues/160).
+ - Fixed "reincarnation" of a subDB after its deletion (https://github.com/erthink/libmdbx/issues/168).
+ - Fixed (disallowing) implicit subDB deletion via operations on `@MAIN`'s DBI-handle.
+ - Fixed a crash of `mdbx_env_info_ex()` in case of a call for a non-open environment (https://github.com/erthink/libmdbx/issues/171).
+ - Fixed the selection/adjustment of values inside `mdbx_env_set_geometry()` for implicit out-of-range cases (https://github.com/erthink/libmdbx/issues/170).
+ - Fixed `mdbx_env_set_option()` for setting the initial and limit size of the dirty page list (https://github.com/erthink/libmdbx/issues/179).
+ - Fixed an unreasonably huge default upper limit for DB geometry (https://github.com/erthink/libmdbx/issues/183).
+ - Fixed `constexpr` specifier for the `slice::invalid()`.
+ - Fixed (no)readahead auto-handling (https://github.com/erthink/libmdbx/issues/164).
+ - Fixed non-alloy build for Windows.
+ - Switched to using Heap-functions instead of LocalAlloc/LocalFree on Windows.
+ - Fixed `mdbx_env_stat_ex()` to return statistics of the whole environment instead of MainDB only (https://github.com/erthink/libmdbx/issues/190).
+ - Fixed building by GCC 4.8.5 (added a workaround for a preprocessor bug).
+ - Fixed building the C++ part for iOS <= 13.0 (unavailability of `std::filesystem::path`).
+ - Fixed building for Windows target versions prior to Windows Vista (`WIN32_WINNT < 0x0600`).
+ - Fixed building by MinGW for Windows (https://github.com/erthink/libmdbx/issues/155).
+
+
+## v0.9.3 at 2021-02-02
+
+Acknowledgements:
+
+ - [Mahlon E. Smith](http://www.martini.nu/) for [FreeBSD port of libmdbx](https://svnweb.freebsd.org/ports/head/databases/mdbx/).
+ - [장세연](http://www.castis.com) for bug fixing and PR.
+ - [Clément Renault](https://github.com/Kerollmops/heed) for [Heed](https://github.com/Kerollmops/heed) fully typed Rust wrapper.
+ - [Alex Sharov](https://github.com/AskAlexSharov) for bug reporting.
+ - [Noel Kuntze](https://github.com/Thermi) for bug reporting.
+
+Removed options and features:
+
+ - Drop `MDBX_HUGE_TRANSACTIONS` build-option (now no longer required).
+
+New features:
+
+ - A package for FreeBSD is available now by Mahlon E. Smith.
+ - New API functions to get/set various options (https://github.com/erthink/libmdbx/issues/128):
+   - the maximum number of named databases for the environment;
+   - the maximum number of threads/reader slots;
+   - threshold (since the last unsteady commit) to force flush the data buffers to disk;
+   - relative period (since the last unsteady commit) to force flush the data buffers to disk;
+   - limit for growing a list of reclaimed/recycled page numbers for finding a sequence of contiguous pages for large data items;
+   - limit for growing a cache of dirty pages for reuse in the current transaction;
+   - limit of pre-allocated memory items for dirty pages;
+   - limit of dirty pages for a write transaction;
+   - initial allocation size for the dirty pages list of a write transaction;
+   - maximal part of the dirty pages that may be spilled when necessary;
+   - minimal part of the dirty pages that should be spilled when necessary;
+   - how many of the parent transaction's dirty pages will be spilled while starting each child transaction;
+ - Unlimited/Dynamic size of retired and dirty page lists (https://github.com/erthink/libmdbx/issues/123).
+ - Added `-p` option (purge subDB before loading) to `mdbx_load` tool.
+ - Reworked spilling of large transactions and committing of nested transactions:
+   - page spilling code reworked to avoid the flaws and bugs inherited from LMDB;
+   - the limit for the number of dirty pages is now controllable at runtime;
+   - spilled pages, including overflow/large pages, can now be reused and refunded/compactified in nested transactions;
+   - more effective refunding/compactification, especially for the loose page cache.
+ - Added `MDBX_ENABLE_REFUND` and `MDBX_PNL_ASCENDING` internal/advanced build options.
+ - Added `mdbx_default_pagesize()` function.
+ - Better support for architectures with a weak/relaxed memory consistency model (ARM, AARCH64, PPC, MIPS, RISC-V, etc.) by means of [C11 atomics](https://en.cppreference.com/w/c/atomic).
+ - Sped up page number lists and dirty page lists (https://github.com/erthink/libmdbx/issues/132).
+ - Added `LIBMDBX_NO_EXPORTS_LEGACY_API` build option.
+
+Fixes:
+
+ - Fixed missing cleanup (null assigned) in the C++ commit/abort (https://github.com/erthink/libmdbx/pull/143).
+ - Fixed `mdbx_realloc()` for the case of nullptr and `MDBX_WITHOUT_MSVC_CRT=ON` for Windows.
+ - Fixed the possibility of using invalid and renewed (closed & re-opened, dropped & re-created) DBI-handles (https://github.com/erthink/libmdbx/issues/146).
+ - Fixed 4-byte aligned access to 64-bit integers, including access to the `bootid` meta-page's field (https://github.com/erthink/libmdbx/issues/153).
+ - Fixed a minor/potential memory leak during page flushing and unspilling.
+ - Fixed handling of cursors' and subDBs' states for nested transactions.
+ - Fixed a page leak in the extra-rare case where the list of retired pages changed during the GC update on transaction commit.
+ - Fixed assertions to avoid false-positive UB detection by CLANG/LLVM (https://github.com/erthink/libmdbx/issues/153).
+ - Fixed `MDBX_TXN_FULL` and regressive `MDBX_KEYEXIST` during large transaction commit with `MDBX_LIFORECLAIM` (https://github.com/erthink/libmdbx/issues/123).
+ - Fixed auto-recovery (`weak->steady` with the same boot-id) when the database size at the last weak checkpoint is larger than at the last steady checkpoint.
+ - Fixed operation on systems with unusual small/large page sizes, including PowerPC (https://github.com/erthink/libmdbx/issues/157).
+
+
+## v0.9.2 at 2020-11-27
+
+Acknowledgements:
+
+ - Jens Alfke (Mobile Architect at [Couchbase](https://www.couchbase.com/)) for [NimDBX](https://github.com/snej/nimdbx).
+ - Clément Renault (CTO at [MeiliSearch](https://www.meilisearch.com/)) for [mdbx-rs](https://github.com/Kerollmops/mdbx-rs).
+ - Alex Sharov (Go-Lang Tech Lead at [TurboGeth/Ethereum](https://ethereum.org/)) for extreme test cases and bug reporting.
+ - George Hazan (CTO at [Miranda NG](https://www.miranda-ng.org/)) for bug reporting.
+ - [Positive Technologies](https://www.ptsecurity.com/) for funding and [The Standoff](https://standoff365.com/).
+
+Added features:
+
+ - Provided package for [buildroot](https://buildroot.org/).
+ - A binding for Nim is [available](https://github.com/snej/nimdbx) now, by Jens Alfke.
+ - Added `mdbx_env_delete()` for deleting environment files in a proper and multiprocess-safe way.
+ - Added `mdbx_txn_commit_ex()` with collecting latency information.
+ - Fast completion of pure nested transactions.
+ - Added `LIBMDBX_INLINE_API` macro and inline versions of some API functions.
+ - Added `mdbx_cursor_copy()` function.
+ - Extended tests for checking cursor tracking.
+ - Added `MDBX_SET_LOWERBOUND` operation for `mdbx_cursor_get()`.
+
+Fixes:
+
+ - Fixed missing installation of `mdbx.h++`.
+ - Fixed use of obsolete `__noreturn`.
+ - Fixed use of `yield` instruction on ARM if unsupported.
+ - Added pthread workaround for buggy toolchain/cmake/buildroot.
+ - Fixed use of `pthread_yield()` for non-GLIBC.
+ - Fixed use of `RegGetValueA()` on Windows 2000/XP.
+ - Fixed use of `GetTickCount64()` on Windows 2000/XP.
+ - Fixed opening DB on network shares (in exclusive mode).
+ - Fixed copy&paste typos.
+ - Fixed a minor false-positive GCC warning.
+ - Added workaround for broken `DEFINE_ENUM_FLAG_OPERATORS` from Windows SDK.
+ - Fixed cursor state after multimap/dupsort repeated deletes (https://github.com/erthink/libmdbx/issues/121).
+ - Added `SIGPIPE` suppression for the internal thread during `mdbx_env_copy()`.
+ - Fixed extra-rare `MDBX_KEY_EXIST` error during `mdbx_commit()` (https://github.com/erthink/libmdbx/issues/131).
+ - Fixed spilled pages checking (https://github.com/erthink/libmdbx/issues/126).
+ - Fixed `mdbx_load` for the 'plain text' and without `-s name` cases (https://github.com/erthink/libmdbx/issues/136).
+ - Fixed save/restore/commit of cursors for nested transactions.
+ - Fixed cursor state in rare/special cases (moving next beyond end-of-data, after deletion, and so on).
+ - Added workaround for MSVC 19.28 (Visual Studio 16.8) (but it may still hang during compilation).
+ - Fixed paranoid Clang C++ UB for bitwise operations with flags defined by enums.
+ - Fixed large-page checking (for compatibility and to avoid false-positive errors from `mdbx_chk`).
+ - Added workaround for Wine (https://github.com/miranda-ng/miranda-ng/issues/1209).
+ - Fixed `ERROR_NOT_SUPPORTED` while opening DB by UNC pathnames (https://github.com/miranda-ng/miranda-ng/issues/2627).
+ - Added handling of the `EXCEPTION_POSSIBLE_DEADLOCK` condition for Windows.
+
+
+## v0.9.1 2020-09-30
+
+Added features:
+
+ - Preliminary C++ API with support for C++17 polymorphic allocators.
+ - [Online C++ API reference](https://erthink.github.io/libmdbx/) by Doxygen.
+ - Quick reference for Insert/Update/Delete operations.
+ - Explicit `MDBX_SYNC_DURABLE` to sync modes for API clarity.
+ - Explicit `MDBX_ALLDUPS` and `MDBX_UPSERT` for API clarity.
+ - Support for read transactions preparation (`MDBX_TXN_RDONLY_PREPARE` flag).
+ - Support for cursor preparation/(pre)allocation and reusing (`mdbx_cursor_create()` and `mdbx_cursor_bind()` functions).
+ - Support for checking a database using a specified meta-page (see `mdbx_chk -h`).
+ - Support for turning to the specific meta-page after checking (see `mdbx_chk -h`).
+ - Support for explicit reader threads (de)registration.
+ - The `mdbx_txn_break()` function to explicitly mark a transaction as broken.
+ - Improved handling of corrupted databases by the `mdbx_chk` utility and `mdbx_walk_tree()` function.
+ - Improved DB corruption detection by checking parent-page-txnid.
+ - Improved opening of large DBs (> 4Gb) from 32-bit code.
+ - Provided `pure-function` and `const-function` attributes to the C API.
+ - Support for a user-settable context for transactions & cursors.
+ - Revised API and documentation related to the Handle-Slow-Readers callback feature.
+
+Deprecated functions and flags:
+
+ - For clarity and API simplification the `MDBX_MAPASYNC` flag is deprecated.
+   Just use `MDBX_SAFE_NOSYNC` or `MDBX_UTTERLY_NOSYNC` instead of it.
+ - `MDBX_oom_func`, `mdbx_env_set_oomfunc()` and `mdbx_env_get_oomfunc()`
+   replaced with `MDBX_hsr_func`, `mdbx_env_set_hsr()` and `mdbx_env_get_hsr()`.
+
+Fixes:
+
+ - Fix `mdbx_strerror()` for `MDBX_BUSY` error (no error description is returned).
+ - Fix updating internal meta-geo information in read-only mode (`EACCESS` or `EBADFD` error).
+ - Fix `mdbx_page_get()` null-deref when the DB is corrupted (crash by `SIGSEGV`).
+ - Fix `mdbx_env_open()` for re-opening after non-fatal errors (`mdbx_chk` unexpected failures).
+ - Workaround for MSVC 19.27 `static_assert()` bug.
+ - Doxygen descriptions and refinements.
+ - Update Valgrind's suppressions.
+ - Workaround to avoid infinite loop of the 'nested' testcase on MIPS under QEMU.
+ - Fix a lot of typos & spelling (thanks to Josh Soref for the PR).
+ - Fix `getopt()` messages for Windows (thanks to Andrey Sporaw for reporting).
+ - Fix MSVC compiler version requirements (thanks to Andrey Sporaw for reporting).
+ - Workarounds for QEMU's bugs to run tests for the cross-built library under QEMU.
+ - The C++ compiler is now optional for building by CMake.
+
+
+## v0.9.0 2020-07-31 (not a release, but API changes)
+
+Added features:
+
+ - [Online C API reference](https://erthink.github.io/libmdbx/) by Doxygen.
+ - Separated enums for environment, sub-databases, transactions, copying and data-update flags.
+
+Deprecated functions and flags:
+
+ - Usage of custom comparators and `mdbx_dbi_open_ex()` is deprecated, since such databases can't be checked by the `mdbx_chk` utility.
+   Please use the value-to-key functions to provide keys that are compatible with the built-in libmdbx comparators.
+
+
+## v0.8.2 2020-07-06
+- Added support for multi-opening the same DB in a process with SysV locking (BSD).
+- Fixed warnings & minor issues for the LCC compiler (E2K).
+- Enabled simultaneously opening the same database from processes with and without the `MDBX_WRITEMAP` option.
+- Added key-to-value, `mdbx_get_keycmp()` and `mdbx_get_datacmp()` functions (helpful to avoid using custom comparators).
+- Added `ENABLE_UBSAN` CMake option for enabling the UndefinedBehaviorSanitizer from GCC/CLANG.
+- Workaround for [CLANG bug](https://bugs.llvm.org/show_bug.cgi?id=43275).
+- Returning `MDBX_CORRUPTED` in case all meta-pages are weak and there is no other error.
+- Refined mode bits while auto-creating the LCK-file.
+- Avoids unnecessary database file re-mapping in case the geometry was changed by another process(es).
+  From the user's point of view, the `MDBX_UNABLE_EXTEND_MAPSIZE` error will now be returned less frequently and only when using the DB in the current process really requires it to be reopened.
+- On-the-fly remapping of the database file was implemented.
+  Remapping with a change of address is now performed automatically if there are no dependent readers in the current process.
+
+
+## v0.8.1 2020-06-12
+- Minor versioning change: the last number in the version now means the number of commits since the last release/tag.
+- Provide a ChangeLog file.
+- Fix for using libmdbx as a C-only sub-project with CMake.
+- Fix `mdbx_env_set_geometry()` for the case where it is called from an opened environment outside of a write transaction.
+- Add support for huge transactions and the `MDBX_HUGE_TRANSACTIONS` build-option (default `OFF`).
+- Refine LTO (link time optimization) for clang.
+- Force enabling exceptions handling for MSVC (`/EHsc` option).
+
+
+## v0.8.0 2020-06-05
+- Support for Android/Bionic.
+- Support for iOS.
+- Auto-handling `MDBX_NOSUBDIR` while opening any existing database.
+- Engage github-actions to make release-assets.
+- Clarify API description.
+- Extended keygen-cases in the stochastic test.
+- Fix fetching of the first/lower key from a LEAF2-page during page merge.
+- Fix missing comma in the array of error messages.
+- Fix div-by-zero while copy-with-compaction for non-resizable environments.
+- Fixes & enhancements for custom comparators.
+- Fix `MDBX_WITHOUT_MSVC_CRT` option and missing `ntdll.def`.
+- Fix `mdbx_env_close()` to work correctly when called concurrently from several threads.
+- Fix null-deref in ASAN-enabled builds while opening the environment with an error and/or read-only.
+- Fix AddressSanitizer errors after closing the environment.
+- Fix/workaround to avoid GCC 10.x pedantic warnings.
+- Fix using `ENODATA` for FreeBSD.
+- Avoid invalidation of DBI-handle(s) when they are just closed.
+- Avoid using `pwritev()` for single-writes (up to 10% speedup for some kernels & scenarios).
+- Avoiding `MDBX_UTTERLY_NOSYNC` as a result of flags merging.
+- Add `mdbx_dbi_dupsort_depthmask()` function.
+- Add `MDBX_CP_FORCE_RESIZEABLE` option.
+- Add deprecated `MDBX_MAP_RESIZED` for compatibility.
+- Add `MDBX_BUILD_TOOLS` option (default `ON`).
+- Refine `mdbx_dbi_open_ex()` to safely open the same handle concurrently from different threads.
+- Truncate the lck-file during environment closing, so a zero-length lck-file indicates that the environment was closed properly.
+- Refine `mdbx_update_gc()` for huge transactions with small database page sizes.
+- Extends dump/load to support all MDBX attributes.
+- Avoid upserting the same key-value data; fix related assertions.
+- Rework min/max length checking for keys & values.
+- Checking the order of keys on all pages during checking.
+- Support `CFLAGS_EXTRA` make-option for convenience.
+- Preserve the last txnid while copying with compactification.
+- Auto-reset a running transaction in mdbx_txn_renew().
+- Automatically abort an errored transaction in mdbx_txn_commit().
+- Auto-choose page size for large databases.
+- Rearrange source files, rework build, options-support by CMake.
+- Crutch for WSL1 (Windows subsystem for Linux).
+- Refine install/uninstall targets.
+- Support for Valgrind 3.14 and later.
+- Add check-analyzer check-ubsan check-asan check-leak targets to Makefile.
+- Minor fix/workaround to avoid UBSAN traps for `memcpy(ptr, NULL, 0)`.
+- Avoid some GCC-analyzer false-positive warnings.
+
+
+## v0.7.0 2020-03-18
+- Workarounds for Wine (Windows compatibility layer for Linux).
+- `MDBX_MAP_RESIZED` renamed to `MDBX_UNABLE_EXTEND_MAPSIZE`.
+- Clarify API description, fix typos.
+- Speedup runtime checks in debug/checked builds.
+- Added checking for read/write transactions overlapping for the same thread; added `MDBX_TXN_OVERLAPPING` error and `MDBX_DBG_LEGACY_OVERLAP` option.
+- Added `mdbx_key_from_jsonInteger()`, `mdbx_key_from_double()`, `mdbx_key_from_float()`, `mdbx_key_from_int64()` and `mdbx_key_from_int32()` functions. See `mdbx.h` for description.
+- Fix compatibility (use zero for invalid DBI).
+- Refine/clarify error messages.
+- Avoids extra "bad txn" error messages from mdbx_chk when the DB is corrupted.
+
+
+## v0.6.0 2020-01-21
+- Fix `mdbx_load` utility for custom comparators.
+- Fix checks related to `MDBX_APPEND` flag inside `mdbx_cursor_put()`.
+- Refine/fix dbi_bind() internals.
+- Refine/fix handling `STATUS_CONFLICTING_ADDRESSES`.
+- Rework `MDBX_DBG_DUMP` option to avoid disk I/O performance degradation.
+- Add built-in help to the test tool.
+- Fix `mdbx_env_set_geometry()` for large page size.
+- Clarify API description & comments, fix typos.
+
+
+## v0.5.0 2019-12-31
+- Fix returning MDBX_RESULT_TRUE from page_alloc().
+- Fix false-positive ASAN issue.
+- Fix assertion for the `MDBX_NOTLS` option.
+- Rework the `MADV_DONTNEED` threshold.
+- Fix `mdbx_chk` utility to not check some numbers if walking the B-tree was disabled.
+- Use the page's mp_txnid for basic integrity checking.
+- Add `MDBX_FORCE_ASSERTIONS` build-time option.
+- Rework `MDBX_DBG_DUMP` to avoid performance degradation.
+- Rename `MDBX_NOSYNC` to `MDBX_SAFE_NOSYNC` for clarity.
+- Interpret `ERROR_ACCESS_DENIED` from `OpenProcess()` as 'process exists'.
+- Avoid using `FILE_FLAG_NO_BUFFERING` for compatibility with small database pages.
+- Added install section for CMake.
+
+
+## v0.4.0 2019-12-02
+- Support for Mac OSX, FreeBSD, NetBSD, OpenBSD, DragonFly BSD, OpenSolaris, OpenIndiana (AIX and HP-UX pending).
+- Use bootid for rollback decisions.
+- Counting retired pages and extended transaction info.
+- Add `MDBX_ACCEDE` flag for database opening.
+- Using OFD-locks and tracking for in-process multi-opening.
+- Hot backup into pipe.
+- Support for cmake & amalgamated sources.
+- Fastest internal sort implementation.
+- New internal dirty-list implementation with lazy sorting.
+- Support for lazy-sync-to-disk with polling.
+- Extended key length.
+- Last update transaction number for each sub-database.
+- Automatic read-ahead enabling/disabling.
+- More auto-compactification.
+- Using -fsanitize=undefined and -Wpedantic options.
+- Rework page merging.
+- Nested transactions.
+- API description.
+- Checking for non-local filesystems to avoid DB corruption.
diff --git a/mdbx/MDBX_LICENSE b/mdbx/MDBX_LICENSE new file mode 100644 index 0000000..05ad757 --- /dev/null +++ b/mdbx/MDBX_LICENSE @@ -0,0 +1,47 @@
+The OpenLDAP Public License
+  Version 2.8, 17 August 2003
+
+Redistribution and use of this software and associated documentation
+("Software"), with or without modification, are permitted provided
+that the following conditions are met:
+
+1. Redistributions in source form must retain copyright statements
+   and notices,
+
+2.
Redistributions in binary form must reproduce applicable copyright + statements and notices, this list of conditions, and the following + disclaimer in the documentation and/or other materials provided + with the distribution, and + +3. Redistributions must contain a verbatim copy of this document. + +The OpenLDAP Foundation may revise this license from time to time. +Each revision is distinguished by a version number. You may use +this Software under terms of this license revision or under the +terms of any subsequent revision of the license. + +THIS SOFTWARE IS PROVIDED BY THE OPENLDAP FOUNDATION AND ITS +CONTRIBUTORS ``AS IS'' AND ANY EXPRESSED OR IMPLIED WARRANTIES, +INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY +AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT +SHALL THE OPENLDAP FOUNDATION, ITS CONTRIBUTORS, OR THE AUTHOR(S) +OR OWNER(S) OF THE SOFTWARE BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +The names of the authors and copyright holders must not be used in +advertising or otherwise to promote the sale, use or other dealing +in this Software without specific, written prior permission. Title +to copyright in this Software shall at all times remain with copyright +holders. + +OpenLDAP is a registered trademark of the OpenLDAP Foundation. + +Copyright 1999-2003 The OpenLDAP Foundation, Redwood City, +California, USA. All Rights Reserved. Permission to copy and +distribute verbatim copies of this document is granted. diff --git a/mdbx/cursor.go b/mdbx/cursor.go index e3ae61d..e29dbae 100644 --- a/mdbx/cursor.go +++ b/mdbx/cursor.go @@ -92,7 +92,7 @@ func (c *Cursor) Renew(txn *Txn) error { func (c *Cursor) Close() { if c._c != nil { if c.txn._txn == nil && !c.txn.readonly { - // the cursor has already been released by LMDB. + // the cursor has already been released by MDBX. 
} else { C.mdbx_cursor_close(c._c) } diff --git a/mdbx/cursor_test.go b/mdbx/cursor_test.go index 97a1767..757eda2 100644 --- a/mdbx/cursor_test.go +++ b/mdbx/cursor_test.go @@ -13,7 +13,6 @@ import ( func TestCursor_Txn(t *testing.T) { env := setup(t) - defer clean(env, t) var db DBI err := env.Update(func(txn *Txn) (err error) { @@ -48,7 +47,6 @@ func TestCursor_Txn(t *testing.T) { func TestCursor_DBI(t *testing.T) { env := setup(t) - defer clean(env, t) err := env.Update(func(txn *Txn) (err error) { db, err := txn.OpenDBI("db", Create, nil, nil) @@ -81,7 +79,6 @@ func TestCursor_DBI(t *testing.T) { func TestCursor_Close(t *testing.T) { env := setup(t) - defer clean(env, t) runtime.LockOSThread() defer runtime.UnlockOSThread() @@ -111,7 +108,6 @@ func TestCursor_Close(t *testing.T) { func TestCursor_bytesBuffer(t *testing.T) { env := setup(t) - defer clean(env, t) db, err := openRoot(env, 0) if err != nil { @@ -164,7 +160,6 @@ func TestCursor_bytesBuffer(t *testing.T) { func TestCursor_PutReserve(t *testing.T) { env := setup(t) - defer clean(env, t) var db DBI key := "reservekey" @@ -210,7 +205,6 @@ func TestCursor_PutReserve(t *testing.T) { func TestCursor_Get_KV(t *testing.T) { env := setup(t) - defer clean(env, t) var dbi DBI err := env.Update(func(txn *Txn) (err error) { @@ -279,7 +273,6 @@ func FromHex(in string) []byte { func TestDupCmpExcludeSuffix32(t *testing.T) { hash32Bytes := FromHex("56e81f171bcc55a6ff8345e692c0f86e5b48e01b996cadc001622fb5e363b421") env := setup(t) - defer clean(env, t) var dbi DBI err := env.Update(func(txn *Txn) (err error) { @@ -416,7 +409,6 @@ func TestDupCmpExcludeSuffix32(t *testing.T) { func TestCursor_Get_op_Set_bytesBuffer(t *testing.T) { env := setup(t) - defer clean(env, t) var dbi DBI err := env.Update(func(txn *Txn) (err error) { @@ -492,7 +484,6 @@ func TestCursor_Get_op_Set_bytesBuffer(t *testing.T) { func TestCursor_Get_DupFixed(t *testing.T) { env := setup(t) - defer clean(env, t) const datasize = 16 pagesize := os.Getpagesize() @@ -576,7 +567,6 @@ func TestCursor_Get_DupFixed(t *testing.T) { func TestCursor_Get_reverse(t *testing.T) { env := setup(t) - defer clean(env, t) var dbi DBI err := env.Update(func(txn *Txn) (err error) { @@ -634,7 +624,7 @@ func TestCursor_Get_reverse(t *testing.T) { //func TestCursor_PutMulti(t *testing.T) { // env := setup(t) -// defer clean(env, t) +// // // key := []byte("k") // items := [][]byte{ @@ -701,7 +691,6 @@ func TestCursor_Get_reverse(t *testing.T) { func TestCursor_Del(t *testing.T) { env := setup(t) - defer clean(env, t) var db DBI type Item struct{ k, v string } @@ -803,7 +792,6 @@ func TestCursor_Del(t *testing.T) { // This test verifies the behavior of Cursor.Count when DupSort is provided. func TestCursor_Count_DupSort(t *testing.T) { env := setup(t) - defer clean(env, t) var db DBI err := env.Update(func(txn *Txn) (err error) { @@ -858,7 +846,6 @@ func TestCursor_Count_DupSort(t *testing.T) { // on the database. 
func TestCursor_Count_noDupSort(t *testing.T) {
 	env := setup(t)
-	defer clean(env, t)
 
 	var db DBI
 	err := env.Update(func(txn *Txn) (err error) {
@@ -900,7 +887,6 @@
 
 func TestCursor_Renew(t *testing.T) {
 	env := setup(t)
-	defer clean(env, t)
 
 	var db DBI
 	err := env.Update(func(txn *Txn) (err error) {
@@ -975,7 +961,6 @@
 
 func BenchmarkCursor(b *testing.B) {
 	env := setup(b)
-	defer clean(env, b)
 
 	var db DBI
 	err := env.View(func(txn *Txn) (err error) {
@@ -1011,7 +996,6 @@
 
 func BenchmarkCursor_Renew(b *testing.B) {
 	env := setup(b)
-	defer clean(env, b)
 
 	var cur *Cursor
 	err := env.View(func(txn *Txn) (err error) {
diff --git a/mdbx/dist/CMakeLists.txt b/mdbx/dist/CMakeLists.txt index e26873e..a36c0ba 100644 --- a/mdbx/dist/CMakeLists.txt +++ b/mdbx/dist/CMakeLists.txt @@ -79,7 +79,13 @@ elseif(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/VERSION" AND
   set(MDBX_AMALGAMATED_SOURCE TRUE)
   set(MDBX_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
 else()
-  message(FATAL_ERROR "Please use libmdbx as a git-submodule or the amalgamated source code")
+  message(FATAL_ERROR "\n"
+    "Please don't use tarballs or zips which are automatically provided by Github! "
+    "These archives do not contain version information and thus are unfit to build libmdbx. "
+    "You can vote for the ability to disable auto-creation of such unsuitable archives at https://github.community/t/disable-tarball\n"
+    "Instead of the above, just clone the git repository, or download a tarball or zip with the properly amalgamated source code. "
+    "For embedding libmdbx use a git-submodule or the amalgamated source code.\n"
+    "Please, avoid using any other techniques.")
 endif()
 
 if(DEFINED PROJECT_NAME)
@@ -558,7 +564,7 @@ endmacro()
 macro(libmdbx_setup_libs TARGET MODE)
   target_link_libraries(${TARGET} ${MODE} Threads::Threads)
   if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
-    target_link_libraries(${TARGET} ${MODE} ntdll.lib)
+    target_link_libraries(${TARGET} ${MODE} ntdll)
     if(MDBX_NTDLL_EXTRA_IMPLIB AND MDBX_WITHOUT_MSVC_CRT)
       target_link_libraries(${TARGET} ${MODE} ntdll_extra)
     endif()
@@ -664,34 +670,34 @@ endif()
 # mdbx-shared-lib installation
 if(NOT DEFINED MDBX_DLL_INSTALL_DESTINATION)
   if(WIN32)
-    set(MDBX_DLL_INSTALL_DESTINATION bin)
+    set(MDBX_DLL_INSTALL_DESTINATION ${CMAKE_INSTALL_BINDIR})
   else()
-    set(MDBX_DLL_INSTALL_DESTINATION lib)
+    set(MDBX_DLL_INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR})
   endif()
 endif()
 if(MDBX_BUILD_SHARED_LIBRARY)
   if(CMAKE_VERSION VERSION_LESS 3.12)
     install(TARGETS mdbx EXPORT libmdbx
       LIBRARY DESTINATION ${MDBX_DLL_INSTALL_DESTINATION} COMPONENT runtime
-      OBJECTS DESTINATION lib COMPONENT devel
-      ARCHIVE DESTINATION lib COMPONENT devel
-      PUBLIC_HEADER DESTINATION include COMPONENT devel
-      INCLUDES DESTINATION include COMPONENT devel)
+      OBJECTS DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT devel
+      ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT devel
+      PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} COMPONENT devel
+      INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} COMPONENT devel)
   else()
     install(TARGETS mdbx EXPORT libmdbx
       LIBRARY DESTINATION ${MDBX_DLL_INSTALL_DESTINATION} COMPONENT runtime
      NAMELINK_COMPONENT devel
-      OBJECTS DESTINATION lib COMPONENT devel
-      ARCHIVE DESTINATION lib COMPONENT devel
-      PUBLIC_HEADER DESTINATION include COMPONENT devel
-      INCLUDES DESTINATION include COMPONENT devel)
+      OBJECTS DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT devel
+      ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
COMPONENT devel + PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} COMPONENT devel + INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} COMPONENT devel) endif() endif(MDBX_BUILD_SHARED_LIBRARY) # mdbx-tools installation if(MDBX_BUILD_TOOLS) if(NOT DEFINED MDBX_TOOLS_INSTALL_DESTINATION) - set(MDBX_TOOLS_INSTALL_DESTINATION bin) + set(MDBX_TOOLS_INSTALL_DESTINATION ${CMAKE_INSTALL_BINDIR}) endif() install( TARGETS @@ -706,7 +712,7 @@ if(MDBX_BUILD_TOOLS) COMPONENT runtime) if(MDBX_INSTALL_MANPAGES) if(NOT DEFINED MDBX_MAN_INSTALL_DESTINATION) - set(MDBX_MAN_INSTALL_DESTINATION man/man1) + set(MDBX_MAN_INSTALL_DESTINATION ${CMAKE_INSTALL_MANDIR}/man1) endif() install( FILES @@ -725,26 +731,28 @@ endif(MDBX_BUILD_TOOLS) if(MDBX_INSTALL_STATIC) if(CMAKE_VERSION VERSION_LESS 3.12) install(TARGETS mdbx-static EXPORT libmdbx - LIBRARY DESTINATION lib COMPONENT devel - OBJECTS DESTINATION lib COMPONENT devel - ARCHIVE DESTINATION lib COMPONENT devel - PUBLIC_HEADER DESTINATION include COMPONENT devel - INCLUDES DESTINATION include COMPONENT devel) + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT devel + OBJECTS DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT devel + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT devel + PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} COMPONENT devel + INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} COMPONENT devel) else() install(TARGETS mdbx-static EXPORT libmdbx - LIBRARY DESTINATION lib COMPONENT devel + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT devel NAMELINK_COMPONENT devel - OBJECTS DESTINATION lib COMPONENT devel - ARCHIVE DESTINATION lib COMPONENT devel - PUBLIC_HEADER DESTINATION include COMPONENT devel - INCLUDES DESTINATION include COMPONENT devel) + OBJECTS DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT devel + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT devel + PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} COMPONENT devel + INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} COMPONENT devel) endif() endif(MDBX_INSTALL_STATIC) ################################################################################ # collect options & build info -string(TIMESTAMP MDBX_BUILD_TIMESTAMP UTC) +if(NOT DEFINED MDBX_BUILD_TIMESTAMP) + string(TIMESTAMP MDBX_BUILD_TIMESTAMP UTC) +endif() set(MDBX_BUILD_FLAGS ${CMAKE_C_FLAGS}) if(MDBX_BUILD_CXX) set(MDBX_BUILD_FLAGS ${CMAKE_CXX_FLAGS}) diff --git a/mdbx/dist/ChangeLog.md b/mdbx/dist/ChangeLog.md index 6d244d6..76a1416 100644 --- a/mdbx/dist/ChangeLog.md +++ b/mdbx/dist/ChangeLog.md @@ -1,19 +1,65 @@ ChangeLog --------- -## v0.10.1 (in development) +## v0.10.2 (in development) -TODO: +### TODO + - [Move most of `mdbx_chk` functional to the library API](https://github.com/erthink/libmdbx/issues/204). + - [Replace SRW-lock on Windows to allow shrink DB with `MDBX_NOTLS` option](https://github.com/erthink/libmdbx/issues/210). + - [More flexible support of asynchronous runtime/framework(s)](https://github.com/erthink/libmdbx/issues/200). + - [Migration guide from LMDB to MDBX](https://github.com/erthink/libmdbx/issues/199). - [Get rid of dirty-pages list in MDBX_WRITEMAP mode](https://github.com/erthink/libmdbx/issues/193). - [Large/Overflow pages accounting for dirty-room](https://github.com/erthink/libmdbx/issues/192). - [C++ Buffer issue](https://github.com/erthink/libmdbx/issues/191). - - Finalize C++ API (few typos and trivia bugs are still likely for now). - [Support for RAW devices](https://github.com/erthink/libmdbx/issues/124). 
 - [Test framework issue](https://github.com/erthink/libmdbx/issues/127).
 - [Support MessagePack for Keys & Values](https://github.com/erthink/libmdbx/issues/115).
 - [Engage new terminology](https://github.com/erthink/libmdbx/issues/137).
- - Packages for [Astra Linux](https://astralinux.ru/), [ALT Linux](https://www.altlinux.org/), [ROSA Linux](https://www.rosalinux.ru/), Fedora/RHEL, Debian/Ubuntu.
+ - Finalize C++ API (few typos and trivia bugs are still likely for now).
+ - Packages for [Astra Linux](https://astralinux.ru/), [ALT Linux](https://www.altlinux.org/), [ROSA Linux](https://www.rosalinux.ru/), etc.
+
+Acknowledgements:
+
+ - [Alex Sharov](https://github.com/AskAlexSharov) for reporting and testing.
+ - [Andrea Lanfranchi](https://github.com/AndreaLanfranchi) for reporting bugs.
+
+New features:
+
+ - [Allow predefining/overriding `MDBX_BUILD_TIMESTAMP` for build reproducibility](https://github.com/erthink/libmdbx/issues/201).
+
+Fixes:
+
+ - Fixed excess meta-page checks when `mdbx_chk` is called to check the DB against a specific meta page; the excess checks could prevent switching to the selected meta page even if the check passed without errors.
+ - Fixed [recursive use of SRW-lock on Windows caused by the `MDBX_NOTLS` option](https://github.com/erthink/libmdbx/issues/203).
+ - Fixed [logging a warning during new DB creation](https://github.com/erthink/libmdbx/issues/205).
+ - Fixed [false-negative `mdbx_cursor_eof()` result](https://github.com/erthink/libmdbx/issues/207).
+ - Fixed [`make install` with non-GNU `install` utility (OSX, BSD)](https://github.com/erthink/libmdbx/issues/208).
+ - Fixed [installation by `CMake` in special cases by completely using the `GNUInstallDirs` variables](https://github.com/erthink/libmdbx/issues/209).
+
+
+## v0.10.1 at 2021-06-01
+
+Acknowledgements:
+
+ - [Alexey Akhunov](https://github.com/AlexeyAkhunov) and [Alex Sharov](https://github.com/AskAlexSharov) for bug reporting and testing.
+ - [Andrea Lanfranchi](https://github.com/AndreaLanfranchi) for bug reporting and testing related to WSL2.
+
+New features:
+
+ - Added the `-p` option to the `mdbx_stat` utility for printing page-operation statistics.
+ - Added explicit checking for, and a warning about, using unfit GitHub archives.
+ - Added fallback from [OFD locking](https://bit.ly/3yFRtYC) to legacy non-OFD POSIX file locks on an `EINVAL` error.
+ - Added the [Plan 9](https://en.wikipedia.org/wiki/9P_(protocol)) network file system to the whitelist, to allow opening a DB in exclusive mode.
+ - Support for opening, from a WSL2 environment, a DB hosted on a Windows drive and mounted via [DrvFs](https://docs.microsoft.com/it-it/archive/blogs/wsl/wsl-file-system-support#drvfs) (i.e. via Plan 9 as noted above).
+
+Fixes:
+
+ - Fixed minor "foo not used" warnings from modern C++ compilers when building the C++ part of the library.
+ - Fixed confusing/messy errors when building the library from unfit GitHub archives (https://github.com/erthink/libmdbx/issues/197).
+ - Fixed `#elsif` typo.
+ - Fixed rare unexpected `MDBX_PROBLEM` error while altering data in huge transactions, due to wrong spilling/ousting of dirty pages (https://github.com/erthink/libmdbx/issues/195).
+ - Re-fixed WSL1/WSL2 detection, now distinguishing between the two (https://github.com/erthink/libmdbx/issues/97).
 
 ## v0.10.0 at 2021-05-09
diff --git a/mdbx/dist/GNUmakefile b/mdbx/dist/GNUmakefile
index 8f170e7..3836dd1 100644
--- a/mdbx/dist/GNUmakefile
+++ b/mdbx/dist/GNUmakefile
@@ -29,7 +29,8 @@ INSTALL ?= install
 CC ?= gcc
 CFLAGS_EXTRA ?=
 LD ?= ld
-MDBX_BUILD_OPTIONS ?= -DNDEBUG=1
+MDBX_BUILD_OPTIONS ?=-DNDEBUG=1
+MDBX_BUILD_TIMESTAMP ?=$(shell date +%Y-%m-%dT%H:%M:%S%z)
 CFLAGS ?= -std=gnu11 -O2 -g -Wall -Werror -Wextra -Wpedantic -ffunction-sections -fPIC -fvisibility=hidden -pthread -Wno-error=attributes $(CFLAGS_EXTRA) # -Wno-tautological-compare
 
 CXX ?= g++
@@ -37,7 +38,7 @@ CXX ?= g++
 CXXSTD ?= $(eval CXXSTD := $$(shell PROBE=$$$$([ -f mdbx.c++ ] && echo mdbx.c++ || echo src/mdbx.c++); for std in gnu++20 c++20 gnu++2a c++2a gnu++17 c++17 gnu++14 c++14 gnu+11 c++11; do $(CXX) -std=$$$${std} -c $$$${PROBE} -o /dev/null 2>/dev/null >/dev/null && echo "-std=$$$${std}" && exit; done))$(CXXSTD)
 CXXFLAGS = $(CXXSTD) $(filter-out -std=gnu11,$(CFLAGS))
 
-# HINT: Try append '--no-as-needed,-lrt' for ability to built with modern glibc, but then run with the old.
+# TIP: Try appending '--no-as-needed,-lrt' to build with a modern glibc but still use it with an older one.
 LIBS ?= $(shell uname | grep -qi SunOS && echo "-lkstat") $(shell uname | grep -qi -e Darwin -e OpenBSD || echo "-lrt") $(shell uname | grep -qi Windows && echo "-lntdll")
 
 LDFLAGS ?= $(shell $(LD) --help 2>/dev/null | grep -q -- --gc-sections && echo '-Wl,--gc-sections,-z,relro,-O1')$(shell $(LD) --help 2>/dev/null | grep -q -- -dead_strip && echo '-Wl,-dead_strip')
@@ -102,7 +103,8 @@ help:
	@echo "  make bench-clean - remove temp database(s) after benchmark"
 
 show-options:
-	@echo " MDBX_BUILD_OPTIONS =$(MDBX_BUILD_OPTIONS)"
+	@echo " MDBX_BUILD_OPTIONS = $(MDBX_BUILD_OPTIONS)"
+	@echo " MDBX_BUILD_TIMESTAMP = $(MDBX_BUILD_TIMESTAMP)"
	@echo '## TIP: Use `make options` to listing available build options.'
	@echo " CFLAGS =$(CFLAGS)"
	@echo " CXXFLAGS =$(CXXFLAGS)"
@@ -128,7 +130,8 @@ options:
	@echo "  EXE_LDFLAGS =$(EXE_LDFLAGS)"
	@echo "  LIBS =$(LIBS)"
	@echo ""
-	@echo " MDBX_BUILD_OPTIONS =$(MDBX_BUILD_OPTIONS)"
+	@echo " MDBX_BUILD_OPTIONS = $(MDBX_BUILD_OPTIONS)"
+	@echo " MDBX_BUILD_TIMESTAMP = $(MDBX_BUILD_TIMESTAMP)"
	@echo ""
	@echo "## Assortment items for MDBX_BUILD_OPTIONS:"
	@echo "## Note that the defaults should already be correct for most platforms;"
@@ -165,7 +168,7 @@ MAN_SRCDIR := man1/
 
 config.h: mdbx.c $(lastword $(MAKEFILE_LIST))
	@echo ' MAKE $@'
-	$(QUIET)(echo '#define MDBX_BUILD_TIMESTAMP "$(shell date +%Y-%m-%dT%H:%M:%S%z)"' \
+	$(QUIET)(echo '#define MDBX_BUILD_TIMESTAMP "$(MDBX_BUILD_TIMESTAMP)"' \
	&& echo '#define MDBX_BUILD_FLAGS "$(CXXSTD) $(CFLAGS) $(LDFLAGS) $(LIBS)"' \
	&& echo '#define MDBX_BUILD_COMPILER "$(shell (LC_ALL=C $(CC) --version || echo 'Please use GCC or CLANG compatible compiler') | head -1)"' \
	&& echo '#define MDBX_BUILD_TARGET "$(shell set -o pipefail; (LC_ALL=C $(CC) -v 2>&1 | grep -i '^Target:' | cut -d ' ' -f 2- || (LC_ALL=C $(CC) --version | grep -qi e2k && echo E2K) || echo 'Please use GCC or CLANG compatible compiler') | head -1)"' \
@@ -194,11 +197,16 @@ mdbx_%: mdbx_%.c libmdbx.a
 
 install: $(LIBRARIES) $(TOOLS) $(HEADERS)
	@echo ' INSTALLING...'
-	$(INSTALL) -D -p $(EXE_INSTALL_FLAGS) -t $(DESTDIR)$(prefix)/bin$(suffix) $(TOOLS) && \
-	$(INSTALL) -D -p $(EXE_INSTALL_FLAGS) -t $(DESTDIR)$(prefix)/lib$(suffix) $(filter-out libmdbx.a,$(LIBRARIES)) && \
-	$(INSTALL) -D -p -t $(DESTDIR)$(prefix)/lib$(suffix) libmdbx.a && \
-	$(INSTALL) -D -p -m 444 -t $(DESTDIR)$(prefix)/include $(HEADERS) && \
-	$(INSTALL) -D -p -m 444 -t $(DESTDIR)$(mandir)/man1 $(addprefix $(MAN_SRCDIR), $(MANPAGES))
+	$(QUIET)mkdir -p $(DESTDIR)$(prefix)/bin$(suffix) && \
+	$(INSTALL) -p $(EXE_INSTALL_FLAGS) $(TOOLS) $(DESTDIR)$(prefix)/bin$(suffix)/ && \
+	mkdir -p $(DESTDIR)$(prefix)/lib$(suffix)/ && \
+	$(INSTALL) -p $(EXE_INSTALL_FLAGS) $(filter-out libmdbx.a,$(LIBRARIES)) $(DESTDIR)$(prefix)/lib$(suffix)/ && \
+	mkdir -p $(DESTDIR)$(prefix)/lib$(suffix)/ && \
+	$(INSTALL) -p libmdbx.a $(DESTDIR)$(prefix)/lib$(suffix)/ && \
+	mkdir -p $(DESTDIR)$(prefix)/include/ && \
+	$(INSTALL) -p -m 444 $(HEADERS) $(DESTDIR)$(prefix)/include/ && \
+	mkdir -p $(DESTDIR)$(mandir)/man1/ && \
+	$(INSTALL) -p -m 444 $(addprefix $(MAN_SRCDIR), $(MANPAGES)) $(DESTDIR)$(mandir)/man1/
 
 install-strip: EXE_INSTALL_FLAGS = -s
 install-strip: install
@@ -219,7 +227,8 @@ uninstall:
 
 IOARENA ?= $(shell \
	(test -x ../ioarena/@BUILD/src/ioarena && echo ../ioarena/@BUILD/src/ioarena) || \
	(test -x ../../@BUILD/src/ioarena && echo ../../@BUILD/src/ioarena) || \
-	(test -x ../../src/ioarena && echo ../../src/ioarena) || which ioarena)
+	(test -x ../../src/ioarena && echo ../../src/ioarena) || which ioarena 2>&- || \
+	echo '\#\# TIP: Clone and build https://github.com/pmwkaa/ioarena.git in a neighbouring directory to make benchmarking available.' >&2)
 NN ?= 25000000
 BENCH_CRUD_MODE ?= nosync
 
diff --git a/mdbx/dist/README.md b/mdbx/dist/README.md
index e176bcf..02131b3 100644
--- a/mdbx/dist/README.md
+++ b/mdbx/dist/README.md
@@ -154,8 +154,8 @@ transaction journal. No crash recovery needed. No maintenance is required.
 ## Limitations
 
 - **Page size**: a power of 2, minimum `256` (mostly for testing), maximum `65536` bytes, default `4096` bytes.
-- **Key size**: minimum `0`, maximum ≈¼ pagesize (`1348` bytes for default 4K pagesize, `21828` bytes for 64K pagesize).
-- **Value size**: minimum `0`, maximum `2146435072` (`0x7FF00000`) bytes for maps, ≈¼ pagesize for multimaps (`1348` bytes for default 4K pagesize, `21828` bytes for 64K pagesize).
+- **Key size**: minimum `0`, maximum ≈½ pagesize (`2022` bytes for default 4K pagesize, `32742` bytes for 64K pagesize).
+- **Value size**: minimum `0`, maximum `2146435072` (`0x7FF00000`) bytes for maps, ≈½ pagesize for multimaps (`2022` bytes for default 4K pagesize, `32742` bytes for 64K pagesize).
 - **Write transaction size**: up to `1327217884` pages (`4.944272` TiB for default 4K pagesize, `79.108351` TiB for 64K pagesize).
 - **Database size**: up to `2147483648` pages (≈`8.0` TiB for default 4K pagesize, ≈`128.0` TiB for 64K pagesize).
 - **Maximum sub-databases**: `32765`.
@@ -200,8 +200,8 @@ the user's point of view.
 ## Added Features
 
 1. Keys could be more than 2 times longer than _LMDB_.
-   > For DB with default page size _libmdbx_ support keys up to 1300 bytes
-   > and up to 21780 bytes for 64K page size. _LMDB_ allows key size up to
+   > For DB with default page size _libmdbx_ supports keys up to 2022 bytes
+   > and up to 32742 bytes for 64K page size. _LMDB_ allows key size up to
    > 511 bytes and may silently loses data with large values.
 2. Up to 30% faster than _LMDB_ in [CRUD](https://en.wikipedia.org/wiki/Create,_read,_update_and_delete) benchmarks.
@@ -343,6 +343,13 @@ Currently, libmdbx is only available in a
 Packages support for common Linux distributions is planned in the future,
 since release the version 1.0.
 
+## Never use tarballs or zips automatically provided by GitHub!
+
+Please don't use tarballs or zips which are automatically provided by GitHub.
+These archives do not contain version information and thus are unfit to build _libmdbx_.
+Instead, just clone the git repository, or download a tarball or zip with the properly amalgamated source code.
+Moreover, please vote for the [ability to disable auto-creation of such unsuitable archives](https://github.community/t/disable-tarball).
+
 ## Source code embedding
 
 _libmdbx_ provides two official ways for integration in source code form:
@@ -380,6 +387,18 @@ So just using CMake or GNU Make in your habitual manner and feel free
 to fill an issue or make pull request in the case something will be
 unexpected or broken down.
 
+### Common important details
+
+#### Build reproducibility
+By default _libmdbx_ tracks the build time via the `MDBX_BUILD_TIMESTAMP` build option and macro.
+So for [reproducible builds](https://en.wikipedia.org/wiki/Reproducible_builds) you should predefine/override it with a known fixed string value. For instance:
+
+ - for a reproducible build with make: `make MDBX_BUILD_TIMESTAMP=unknown ` ...
+ - or during configuration with CMake: `cmake -DMDBX_BUILD_TIMESTAMP:STRING=unknown ` ...
+
+Of course, in addition to this, your toolchain must ensure the reproducibility of builds.
+For more information please refer to [reproducible-builds.org](https://reproducible-builds.org/).
+
 #### DSO/DLL unloading and destructors of Thread-Local-Storage objects
 When building _libmdbx_ as a shared library or use static _libmdbx_ as a part
 of another dynamic library, it is advisable to make sure that your
@@ -472,7 +491,7 @@ Please refer to the [official guide](https://developer.android.com/studio/projec
 ### iOS
 
 To build _libmdbx_ for iOS, we recommend using CMake with the
-"[toolchain file](https://cmake.org/cmake/help/latest/variable/CMAKE_TOOLCHAIN_FILE.html)"
+["toolchain file"](https://cmake.org/cmake/help/latest/variable/CMAKE_TOOLCHAIN_FILE.html)
 from the [ios-cmake](https://github.com/leetal/ios-cmake) project.
 
diff --git a/mdbx/dist/VERSION b/mdbx/dist/VERSION
index b8d6a6d..3b00a3e 100644
--- a/mdbx/dist/VERSION
+++ b/mdbx/dist/VERSION
@@ -1 +1 @@
-0.10.0.0
+0.10.1.15
diff --git a/mdbx/dist/man1/mdbx_stat.1 b/mdbx/dist/man1/mdbx_stat.1
index c52396b..b259cc3 100644
--- a/mdbx/dist/man1/mdbx_stat.1
+++ b/mdbx/dist/man1/mdbx_stat.1
@@ -12,6 +12,8 @@ mdbx_stat \- MDBX environment status tool
 [\c
+.BR \-p ]
+[\c
 .BR \-e ]
 [\c
 .BR \-f [ f [ f ]]]
@@ -35,13 +37,18 @@ Write the library version number to the standard output, and exit.
 .BR \-q
 Be quiet.
 .TP
+.BR \-p
+Display overall statistics of page operations for all (running, completed
+and aborted) transactions in the current multi-process session (i.e. since
+the first process opened the database after all processes had previously
+closed it).
+.TP
 .BR \-e
 Display information about the database environment.
 .TP
 .BR \-f
-Display information about the environment freelist.
-If \fB\-ff\fP is given, summarize each freelist entry.
-If \fB\-fff\fP is given, display the full list of page IDs in the freelist.
+Display information about the environment GC.
+If \fB\-ff\fP is given, summarize each GC/freelist entry.
+If \fB\-fff\fP is given, display the full list of page IDs in the GC/freelist. .TP .BR \-r Display information about the environment reader table. diff --git a/mdbx/dist/mdbx.c b/mdbx/dist/mdbx.c index 9f63ed4..06af057 100644 --- a/mdbx/dist/mdbx.c +++ b/mdbx/dist/mdbx.c @@ -12,7 +12,7 @@ * . */ #define xMDBX_ALLOY 1 -#define MDBX_BUILD_SOURCERY 10aa116f5f6a1fca4ccea1310d3d331a39161abc5b63b6a30e01812eab671e7c_v0_10_0_0_gaa1f6fbd +#define MDBX_BUILD_SOURCERY 220eee3b3a4b48cb20897d772c6665de65226e5811d687b6f7f43b2beeb9cb31_v0_10_1_15_g63e7276c #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -206,14 +206,6 @@ # endif #endif /* __must_check_result */ -#ifndef __maybe_unused -# if defined(__GNUC__) || __has_attribute(__unused__) -# define __maybe_unused __attribute__((__unused__)) -# else -# define __maybe_unused -# endif -#endif /* __maybe_unused */ - #if !defined(__noop) && !defined(_MSC_VER) # define __noop(...) do {} while(0) #endif /* __noop */ @@ -359,7 +351,7 @@ #endif /* __anonymous_struct_extension__ */ #ifndef __Wpedantic_format_voidptr - static __inline __maybe_unused const void* MDBX_PURE_FUNCTION + MDBX_MAYBE_UNUSED MDBX_PURE_FUNCTION static __inline const void* __Wpedantic_format_voidptr(const void* ptr) {return ptr;} # define __Wpedantic_format_voidptr(ARG) __Wpedantic_format_voidptr(ARG) #endif /* __Wpedantic_format_voidptr */ @@ -971,7 +963,7 @@ typedef pthread_mutex_t mdbx_fastmutex_t; /* Get the size of a memory page for the system. * This is the basic size that the platform's memory manager uses, and is * fundamental to the use of memory-mapped files. */ -MDBX_NOTHROW_CONST_FUNCTION static __maybe_unused __inline size_t +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline size_t mdbx_syspagesize(void) { #if defined(_WIN32) || defined(_WIN64) SYSTEM_INFO si; @@ -1058,7 +1050,7 @@ extern void mdbx_osal_jitter(bool tiny); #include #endif -static __maybe_unused __inline void mdbx_compiler_barrier(void) { +MDBX_MAYBE_UNUSED static __inline void mdbx_compiler_barrier(void) { #if defined(__clang__) || defined(__GNUC__) __asm__ __volatile__("" ::: "memory"); #elif defined(_MSC_VER) @@ -1078,7 +1070,7 @@ static __maybe_unused __inline void mdbx_compiler_barrier(void) { #endif } -static __maybe_unused __inline void mdbx_memory_barrier(void) { +MDBX_MAYBE_UNUSED static __inline void mdbx_memory_barrier(void) { #ifdef MDBX_HAVE_C11ATOMICS atomic_thread_fence(memory_order_seq_cst); #elif defined(__ATOMIC_SEQ_CST) @@ -1118,8 +1110,8 @@ static __maybe_unused __inline void mdbx_memory_barrier(void) { #define mdbx_asprintf asprintf #define mdbx_vasprintf vasprintf #else -MDBX_INTERNAL_FUNC MDBX_PRINTF_ARGS(2, 3) int __maybe_unused - mdbx_asprintf(char **strp, const char *fmt, ...); +MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC + MDBX_PRINTF_ARGS(2, 3) int mdbx_asprintf(char **strp, const char *fmt, ...); MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, va_list ap); #endif @@ -1142,7 +1134,7 @@ MDBX_INTERNAL_VAR bool mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */; LIBMDBX_API char *mdbx_strdup(const char *str); #endif -static __maybe_unused __inline int mdbx_get_errno(void) { +MDBX_MAYBE_UNUSED static __inline int mdbx_get_errno(void) { #if defined(_WIN32) || defined(_WIN64) DWORD rc = GetLastError(); #else @@ -1226,8 +1218,10 @@ MDBX_INTERNAL_FUNC int mdbx_mmap(const int flags, mdbx_mmap_t *map, const size_t must, const size_t limit, const unsigned options); MDBX_INTERNAL_FUNC int mdbx_munmap(mdbx_mmap_t *map); -MDBX_INTERNAL_FUNC int mdbx_mresize(int 
flags, mdbx_mmap_t *map, size_t current, - size_t wanna, const bool may_move); +#define MDBX_MRESIZE_MAY_MOVE 0x00000100 +#define MDBX_MRESIZE_MAY_UNMAP 0x00000200 +MDBX_INTERNAL_FUNC int mdbx_mresize(const int flags, mdbx_mmap_t *map, + size_t size, size_t limit); #if defined(_WIN32) || defined(_WIN64) typedef struct { unsigned limit, count; @@ -1244,7 +1238,7 @@ MDBX_INTERNAL_FUNC int mdbx_msync(mdbx_mmap_t *map, size_t offset, MDBX_INTERNAL_FUNC int mdbx_check_fs_rdonly(mdbx_filehandle_t handle, const char *pathname, int err); -static __maybe_unused __inline uint32_t mdbx_getpid(void) { +MDBX_MAYBE_UNUSED static __inline uint32_t mdbx_getpid(void) { STATIC_ASSERT(sizeof(mdbx_pid_t) <= sizeof(uint32_t)); #if defined(_WIN32) || defined(_WIN64) return GetCurrentProcessId(); @@ -1253,7 +1247,7 @@ static __maybe_unused __inline uint32_t mdbx_getpid(void) { #endif } -static __maybe_unused __inline uintptr_t mdbx_thread_self(void) { +MDBX_MAYBE_UNUSED static __inline uintptr_t mdbx_thread_self(void) { mdbx_tid_t thunk; STATIC_ASSERT(sizeof(uintptr_t) >= sizeof(thunk)); #if defined(_WIN32) || defined(_WIN64) @@ -1264,7 +1258,7 @@ static __maybe_unused __inline uintptr_t mdbx_thread_self(void) { return (uintptr_t)thunk; } -MDBX_INTERNAL_FUNC void __maybe_unused mdbx_osal_jitter(bool tiny); +MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC void mdbx_osal_jitter(bool tiny); MDBX_INTERNAL_FUNC uint64_t mdbx_osal_monotime(void); MDBX_INTERNAL_FUNC uint64_t mdbx_osal_16dot16_to_monotime(uint32_t seconds_16dot16); @@ -2035,7 +2029,7 @@ static __always_inline memory_order mo_c11_load(enum MDBX_memory_order fence) { static __inline void mdbx_jitter4testing(bool tiny); -static __maybe_unused __always_inline void +MDBX_MAYBE_UNUSED static __always_inline void mdbx_memory_fence(enum MDBX_memory_order order, bool write) { #ifdef MDBX_HAVE_C11ATOMICS atomic_thread_fence(write ? mo_c11_store(order) : mo_c11_load(order)); @@ -2047,7 +2041,7 @@ mdbx_memory_fence(enum MDBX_memory_order order, bool write) { #endif /* MDBX_HAVE_C11ATOMICS */ } -static __maybe_unused __always_inline uint32_t +MDBX_MAYBE_UNUSED static __always_inline uint32_t atomic_store32(MDBX_atomic_uint32_t *p, const uint32_t value, enum MDBX_memory_order order) { STATIC_ASSERT(sizeof(MDBX_atomic_uint32_t) == 4); @@ -2063,7 +2057,7 @@ atomic_store32(MDBX_atomic_uint32_t *p, const uint32_t value, return value; } -static __maybe_unused __always_inline uint32_t +MDBX_MAYBE_UNUSED static __always_inline uint32_t atomic_load32(const MDBX_atomic_uint32_t *p, enum MDBX_memory_order order) { STATIC_ASSERT(sizeof(MDBX_atomic_uint32_t) == 4); #ifdef MDBX_HAVE_C11ATOMICS @@ -2078,7 +2072,7 @@ atomic_load32(const MDBX_atomic_uint32_t *p, enum MDBX_memory_order order) { #endif /* MDBX_HAVE_C11ATOMICS */ } -static __maybe_unused __always_inline uint64_t +MDBX_MAYBE_UNUSED static __always_inline uint64_t atomic_store64(MDBX_atomic_uint64_t *p, const uint64_t value, enum MDBX_memory_order order) { STATIC_ASSERT(sizeof(MDBX_atomic_uint64_t) == 8); @@ -2102,7 +2096,7 @@ atomic_store64(MDBX_atomic_uint64_t *p, const uint64_t value, return value; } -static __maybe_unused +MDBX_MAYBE_UNUSED static #if MDBX_64BIT_ATOMIC __always_inline #endif /* MDBX_64BIT_ATOMIC */ @@ -2690,15 +2684,7 @@ typedef struct MDBX_dbx { * Every operation requires a transaction handle. 
*/ struct MDBX_txn { #define MDBX_MT_SIGNATURE UINT32_C(0x93D53A31) - size_t mt_signature; - MDBX_txn *mt_parent; /* parent of a nested txn */ - /* Nested txn under this txn, set together with flag MDBX_TXN_HAS_CHILD */ - MDBX_txn *mt_child; - MDBX_geo mt_geo; - /* next unallocated page */ -#define mt_next_pgno mt_geo.next - /* corresponding to the current size of datafile */ -#define mt_end_pgno mt_geo.now + uint32_t mt_signature; /* Transaction Flags */ /* mdbx_txn_begin() flags */ @@ -2727,8 +2713,17 @@ struct MDBX_txn { MDBX_SHRINK_ALLOWED) #error "Oops, some flags overlapped or wrong" #endif + uint32_t mt_flags; + + MDBX_txn *mt_parent; /* parent of a nested txn */ + /* Nested txn under this txn, set together with flag MDBX_TXN_HAS_CHILD */ + MDBX_txn *mt_child; + MDBX_geo mt_geo; + /* next unallocated page */ +#define mt_next_pgno mt_geo.next + /* corresponding to the current size of datafile */ +#define mt_end_pgno mt_geo.now - unsigned mt_flags; /* The ID of this transaction. IDs are integers incrementing from 1. * Only committed write transactions increment the ID. If a transaction * aborts, the ID may be re-used by the next writer. */ @@ -3148,7 +3143,7 @@ void mdbx_assert_fail(const MDBX_env *env, const char *msg, const char *func, #define mdbx_flush_incoherent_cpu_writeback() mdbx_compiler_barrier() #endif /* MDBX_CPU_WRITEBACK_INCOHERENT */ -static __maybe_unused __inline void +MDBX_MAYBE_UNUSED static __inline void mdbx_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { #if MDBX_MMAP_INCOHERENT_FILE_WRITE char *const begin = (char *)(-pagesize & (intptr_t)addr); @@ -3190,7 +3185,7 @@ MDBX_INTERNAL_FUNC void mdbx_rthc_global_init(void); MDBX_INTERNAL_FUNC void mdbx_rthc_global_dtor(void); MDBX_INTERNAL_FUNC void mdbx_rthc_thread_dtor(void *ptr); -static __maybe_unused __inline void mdbx_jitter4testing(bool tiny) { +MDBX_MAYBE_UNUSED static __inline void mdbx_jitter4testing(bool tiny) { #if MDBX_DEBUG if (MDBX_DBG_JITTER & mdbx_runtime_flags) mdbx_osal_jitter(tiny); @@ -3343,35 +3338,35 @@ typedef struct MDBX_node { /* Do not spill pages to disk if txn is getting full, may fail instead */ #define MDBX_NOSPILL 0x8000 -MDBX_NOTHROW_CONST_FUNCTION static __maybe_unused __inline pgno_t +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t pgno_add(pgno_t base, pgno_t augend) { assert(base <= MAX_PAGENO); return (augend < MAX_PAGENO - base) ? base + augend : MAX_PAGENO; } -MDBX_NOTHROW_CONST_FUNCTION static __maybe_unused __inline pgno_t +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t pgno_sub(pgno_t base, pgno_t subtrahend) { assert(base >= MIN_PAGENO); return (subtrahend < base - MIN_PAGENO) ? 
base - subtrahend : MIN_PAGENO; } -MDBX_NOTHROW_CONST_FUNCTION static __always_inline __maybe_unused bool +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline bool is_powerof2(size_t x) { return (x & (x - 1)) == 0; } -MDBX_NOTHROW_CONST_FUNCTION static __always_inline __maybe_unused size_t +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline size_t floor_powerof2(size_t value, size_t granularity) { assert(is_powerof2(granularity)); return value & ~(granularity - 1); } -MDBX_NOTHROW_CONST_FUNCTION static __always_inline __maybe_unused size_t +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline size_t ceil_powerof2(size_t value, size_t granularity) { return floor_powerof2(value + granularity - 1, granularity); } -MDBX_NOTHROW_CONST_FUNCTION static __maybe_unused unsigned +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static unsigned log2n_powerof2(size_t value) { assert(value > 0 && value < INT32_MAX && is_powerof2(value)); assert((value & -(int32_t)value) == value); @@ -3401,7 +3396,7 @@ log2n_powerof2(size_t value) { #define ENV_USABLE_FLAGS (ENV_CHANGEABLE_FLAGS | ENV_CHANGELESS_FLAGS) #if !defined(__cplusplus) || CONSTEXPR_ENUM_FLAGS_OPERATIONS -static __maybe_unused void static_checks(void) { +MDBX_MAYBE_UNUSED static void static_checks(void) { STATIC_ASSERT_MSG(INT16_MAX - CORE_DBS == MDBX_MAX_DBI, "Oops, MDBX_MAX_DBI or CORE_DBS?"); STATIC_ASSERT_MSG((unsigned)(MDBX_DB_ACCEDE | MDBX_CREATE) == @@ -3520,7 +3515,7 @@ MDBX_NOTHROW_CONST_FUNCTION static uint16_t pages2pv(size_t pages) { /*------------------------------------------------------------------------------ * Unaligned access */ -MDBX_NOTHROW_CONST_FUNCTION static __maybe_unused __always_inline unsigned +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline unsigned field_alignment(unsigned alignment_baseline, size_t field_offset) { unsigned merge = alignment_baseline | (unsigned)field_offset; return merge & -(int)merge; @@ -4035,7 +4030,7 @@ page_used(const MDBX_env *env, const MDBX_page *mp) { } /* The percentage of space used in the page, in a percents. 
*/ -MDBX_NOTHROW_PURE_FUNCTION static __maybe_unused __inline double +MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static __inline double page_fill(const MDBX_env *env, const MDBX_page *mp) { return page_used(env, mp) * 100.0 / page_space(env); } @@ -4441,7 +4436,7 @@ static __always_inline uint64_t safe64_read(const MDBX_atomic_uint64_t *p) { } #if 0 /* unused for now */ - static __maybe_unused __always_inline bool safe64_is_valid(uint64_t v) { +MDBX_MAYBE_UNUSED static __always_inline bool safe64_is_valid(uint64_t v) { #if MDBX_WORDBITS >= 64 return v < SAFE64_INVALID_THRESHOLD; #else @@ -4449,7 +4444,7 @@ static __always_inline uint64_t safe64_read(const MDBX_atomic_uint64_t *p) { #endif /* MDBX_WORDBITS */ } - static __maybe_unused __always_inline bool +MDBX_MAYBE_UNUSED static __always_inline bool safe64_is_valid_ptr(const MDBX_atomic_uint64_t *p) { #if MDBX_64BIT_ATOMIC return atomic_load64(p, mo_AcquireRelease) < SAFE64_INVALID_THRESHOLD; @@ -4471,7 +4466,7 @@ static __always_inline void safe64_update(MDBX_atomic_uint64_t *p, } /* non-atomic increment with safety for reading a half-updated value */ -static __maybe_unused +MDBX_MAYBE_UNUSED static #if MDBX_64BIT_ATOMIC __always_inline #endif /* MDBX_64BIT_ATOMIC */ @@ -4923,7 +4918,10 @@ static int uniq_peek(const mdbx_mmap_t *pending, mdbx_mmap_t *scan) { rc = MDBX_SUCCESS; } else { bait = 0 /* hush MSVC warning */; - rc = mdbx_msync(scan, 0, sizeof(MDBX_lockinfo), true); +#if MDBX_ENABLE_PGOP_STAT + safe64_inc(&scan_lck->mti_pgop_stat.wops, 1); +#endif /* MDBX_ENABLE_PGOP_STAT */ + rc = mdbx_msync(scan, 0, sizeof(MDBX_lockinfo), MDBX_SYNC_DATA); if (rc == MDBX_SUCCESS) rc = mdbx_pread(pending->fd, &bait, sizeof(scan_lck->mti_bait_uniqueness), offsetof(MDBX_lockinfo, mti_bait_uniqueness)); @@ -4979,7 +4977,11 @@ __cold static int uniq_check(const mdbx_mmap_t *pending, MDBX_env **found) { if (err == MDBX_RESULT_TRUE) err = uniq_poke(pending, &scan->me_lck_mmap, &salt); if (err == MDBX_RESULT_TRUE) { - (void)mdbx_msync(&scan->me_lck_mmap, 0, sizeof(MDBX_lockinfo), false); +#if MDBX_ENABLE_PGOP_STAT + safe64_inc(&scan->me_lck_mmap.lck->mti_pgop_stat.wops, 1); +#endif /* MDBX_ENABLE_PGOP_STAT */ + (void)mdbx_msync(&scan->me_lck_mmap, 0, sizeof(MDBX_lockinfo), + MDBX_SYNC_NONE); err = uniq_poke(pending, &scan->me_lck_mmap, &salt); } if (err == MDBX_RESULT_TRUE) { @@ -6869,8 +6871,8 @@ static __always_inline unsigned mdbx_dpl_exist(MDBX_txn *txn, pgno_t pgno) { return (dl->items[i].pgno == pgno) ? 
i : 0; } -static __maybe_unused const MDBX_page *debug_dpl_find(const MDBX_txn *txn, - const pgno_t pgno) { +MDBX_MAYBE_UNUSED static const MDBX_page *debug_dpl_find(const MDBX_txn *txn, + const pgno_t pgno) { const MDBX_dpl *dl = txn->tw.dirtylist; assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); for (unsigned i = dl->length; i > dl->sorted; --i) @@ -6905,8 +6907,10 @@ mdbx_dpl_append(MDBX_txn *txn, pgno_t pgno, MDBX_page *page, unsigned npages) { if (mdbx_audit_enabled()) { for (unsigned i = dl->length; i > 0; --i) { assert(dl->items[i].pgno != pgno); - if (unlikely(dl->items[i].pgno == pgno)) + if (unlikely(dl->items[i].pgno == pgno)) { + mdbx_error("Page %u already exist in the DPL at %u", pgno, i); return MDBX_PROBLEM; + } } } @@ -6943,6 +6947,13 @@ mdbx_dpl_append(MDBX_txn *txn, pgno_t pgno, MDBX_page *page, unsigned npages) { return MDBX_SUCCESS; } +static __inline uint32_t mdbx_dpl_age(const MDBX_txn *txn, unsigned i) { + const MDBX_dpl *dl = txn->tw.dirtylist; + assert((int)i > 0 && i <= dl->length); + /* overflow could be here */ + return (txn->tw.dirtylru - dl->items[i].lru) & UINT32_C(0x7fffFFFF); +} + /*----------------------------------------------------------------------------*/ uint8_t mdbx_runtime_flags = MDBX_RUNTIME_FLAGS_INIT; @@ -7025,7 +7036,8 @@ static int __must_check_result mdbx_page_split(MDBX_cursor *mc, static int __must_check_result mdbx_read_header(MDBX_env *env, MDBX_meta *meta, uint64_t *filesize, - const int lck_exclusive); + const int lck_exclusive, + const mdbx_mode_t mode_bits); static int __must_check_result mdbx_sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending); static int mdbx_env_close0(MDBX_env *env); @@ -7378,7 +7390,7 @@ static const char *mdbx_leafnode_type(MDBX_node *n) { } /* Display all the keys in the page. */ -static __maybe_unused void mdbx_page_list(MDBX_page *mp) { +MDBX_MAYBE_UNUSED static void mdbx_page_list(MDBX_page *mp) { pgno_t pgno = mp->mp_pgno; const char *type; MDBX_node *node; @@ -7468,7 +7480,7 @@ static __maybe_unused void mdbx_page_list(MDBX_page *mp) { (mc)->mc_xcursor->mx_cursor.mc_pg[0] = node_data(xr_node); \ } while (0) -static __maybe_unused bool cursor_is_tracked(const MDBX_cursor *mc) { +MDBX_MAYBE_UNUSED static bool cursor_is_tracked(const MDBX_cursor *mc) { for (MDBX_cursor *scan = mc->mc_txn->tw.cursors[mc->mc_dbi]; scan; scan = scan->mc_next) if (mc == ((mc->mc_flags & C_SUB) ? 
&scan->mc_xcursor->mx_cursor : scan)) @@ -7556,7 +7568,7 @@ static MDBX_page *mdbx_page_malloc(MDBX_txn *txn, unsigned num) { static void mdbx_dpage_free(MDBX_env *env, MDBX_page *dp, unsigned npages) { VALGRIND_MAKE_MEM_UNDEFINED(dp, pgno2bytes(env, npages)); ASAN_UNPOISON_MEMORY_REGION(dp, pgno2bytes(env, npages)); - if (MDBX_DEBUG || unlikely(env->me_flags & MDBX_PAGEPERTURB)) + if (MDBX_DEBUG != 0 || unlikely(env->me_flags & MDBX_PAGEPERTURB)) memset(dp, -1, pgno2bytes(env, npages)); if (npages == 1 && env->me_dp_reserve_len < env->me_options.dp_reserve_limit) { @@ -7595,7 +7607,7 @@ static __always_inline MDBX_db *mdbx_outer_db(MDBX_cursor *mc) { return couple->outer.mc_db; } -static __cold __maybe_unused bool mdbx_dirtylist_check(MDBX_txn *txn) { +MDBX_MAYBE_UNUSED __cold static bool mdbx_dirtylist_check(MDBX_txn *txn) { const MDBX_dpl *const dl = txn->tw.dirtylist; assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); mdbx_tassert(txn, txn->tw.dirtyroom + dl->length == @@ -7615,8 +7627,9 @@ static __cold __maybe_unused bool mdbx_dirtylist_check(MDBX_txn *txn) { if (unlikely(dp->mp_pgno != dl->items[i].pgno)) return false; - mdbx_tassert(txn, txn->tw.dirtylru >= dl->items[i].lru); - if (unlikely(txn->tw.dirtylru < dl->items[i].lru)) + const uint32_t age = mdbx_dpl_age(txn, i); + mdbx_tassert(txn, age < UINT32_MAX / 3); + if (unlikely(age > UINT32_MAX / 3)) return false; mdbx_tassert(txn, dp->mp_flags == P_LOOSE || IS_MODIFIABLE(txn, dp)); @@ -8142,7 +8155,7 @@ static int mdbx_page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, ? pgno + 2 : txn->tw.loose_refund_wl; #endif /* MDBX_ENABLE_REFUND */ - if (MDBX_DEBUG || unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB)) + if (MDBX_DEBUG != 0 || unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB)) memset(page_data(mp), -1, txn->mt_env->me_psize - PAGEHDRSZ); VALGRIND_MAKE_MEM_NOACCESS(page_data(mp), txn->mt_env->me_psize - PAGEHDRSZ); @@ -8170,7 +8183,7 @@ static int mdbx_page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, } #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) - if (MDBX_DEBUG || unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB)) + if (MDBX_DEBUG != 0 || unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB)) #endif mdbx_kill_page(txn, mp, pgno, npages); if (!(txn->mt_flags & MDBX_WRITEMAP)) { @@ -8384,58 +8397,33 @@ static int spill_page(MDBX_txn *txn, struct mdbx_iov_ctx *ctx, MDBX_page *dp, * Returns the number of pages marked as unspillable. 
*/ static unsigned mdbx_cursor_keep(MDBX_txn *txn, MDBX_cursor *mc) { unsigned keep = 0; - if (!(mc->mc_flags & C_INITIALIZED)) - return keep; - -loop:; - const MDBX_page *mp = NULL; - for (unsigned i = 0; i < mc->mc_snum; i++) { - mp = mc->mc_pg[i]; - if (IS_MODIFIABLE(txn, mp)) { - unsigned const n = mdbx_dpl_search(txn, mp->mp_pgno); - if (txn->tw.dirtylist->items[n].pgno == mp->mp_pgno && - txn->tw.dirtylist->items[n].lru != txn->tw.dirtylru) { - txn->tw.dirtylist->items[n].lru = txn->tw.dirtylru; - keep++; + while (mc->mc_flags & C_INITIALIZED) { + for (unsigned i = 0; i < mc->mc_snum; ++i) { + const MDBX_page *mp = mc->mc_pg[i]; + if (IS_MODIFIABLE(txn, mp) && !IS_SUBP(mp)) { + unsigned const n = mdbx_dpl_search(txn, mp->mp_pgno); + if (txn->tw.dirtylist->items[n].pgno == mp->mp_pgno && + mdbx_dpl_age(txn, n)) { + txn->tw.dirtylist->items[n].lru = txn->tw.dirtylru; + ++keep; + } } } - } - if (!(mp && IS_LEAF(mp))) - return keep; - - /* Proceed to mx if it is at a sub-database */ - MDBX_xcursor *mx = mc->mc_xcursor; - if (!(mx && (mx->mx_cursor.mc_flags & C_INITIALIZED))) - return keep; - - const unsigned nkeys = page_numkeys(mp); - unsigned ki = mc->mc_ki[mc->mc_top]; - mdbx_cassert(mc, nkeys > 0 && - (ki < nkeys || - (ki == nkeys && (mx->mx_cursor.mc_flags & C_EOF)))); - ki -= ki >= nkeys; - if ((node_flags(page_node(mp, ki)) & F_SUBDATA)) { - mc = &mx->mx_cursor; - goto loop; + if (!mc->mc_xcursor) + break; + mc = &mc->mc_xcursor->mx_cursor; } return keep; } static unsigned mdbx_txn_keep(MDBX_txn *txn, MDBX_cursor *m0) { unsigned keep = m0 ? mdbx_cursor_keep(txn, m0) : 0; - - for (unsigned i = FREE_DBI; i < txn->mt_numdbs; ++i) { - const pgno_t pgno = txn->mt_dbs[i].md_root; - if ((txn->mt_dbistate[i] & DBI_DIRTY) && pgno != P_INVALID) { - unsigned const n = mdbx_dpl_search(txn, pgno); - if (likely(txn->tw.dirtylist->items[n].pgno == pgno)) { - txn->tw.dirtylist->items[n].lru = txn->tw.dirtylru; - for (MDBX_cursor *mc = txn->tw.cursors[i]; mc; mc = mc->mc_next) - if (mc != m0) - keep += mdbx_cursor_keep(txn, mc); - } - } - } + for (unsigned i = FREE_DBI; i < txn->mt_numdbs; ++i) + if (F_ISSET(txn->mt_dbistate[i], DBI_DIRTY | DBI_VALID) && + txn->mt_dbs[i].md_root != P_INVALID) + for (MDBX_cursor *mc = txn->tw.cursors[i]; mc; mc = mc->mc_next) + if (mc != m0) + keep += mdbx_cursor_keep(txn, mc); return keep; } @@ -8444,12 +8432,12 @@ static unsigned mdbx_txn_keep(MDBX_txn *txn, MDBX_cursor *m0) { * ... * > 255 = must not be spilled. 
*/ static unsigned spill_prio(const MDBX_txn *txn, const unsigned i, - const unsigned lru_min, const unsigned reciprocal) { + const uint32_t reciprocal) { MDBX_dpl *const dl = txn->tw.dirtylist; - const unsigned lru = dl->items[i].lru; + const uint32_t age = mdbx_dpl_age(txn, i); const unsigned npages = dpl_npages(dl, i); const pgno_t pgno = dl->items[i].pgno; - if (lru == txn->tw.dirtylru) { + if (age == 0) { mdbx_debug("skip %s %u page %" PRIaPGNO, "keep", npages, pgno); return 256; } @@ -8478,21 +8466,22 @@ static unsigned spill_prio(const MDBX_txn *txn, const unsigned i, while ((parent = parent->mt_parent) != nullptr); } - unsigned prio = 1 + ((lru - lru_min) * reciprocal >> 8); - mdbx_tassert(txn, prio > 0 && prio < 256); + mdbx_tassert(txn, age * (uint64_t)reciprocal < UINT32_MAX); + unsigned prio = age * reciprocal >> 24; + mdbx_tassert(txn, prio < 256); if (likely(npages == 1)) - return prio; + return prio = 256 - prio; /* make a large/overflow pages be likely to spill */ - uint32_t x = npages | npages >> 1; - x |= x >> 2; - x |= x >> 4; - x |= x >> 8; - x |= x >> 16; - x = (255 - prio) * log2n_powerof2(x + 1) + /* golden ratio factor */ 157; - x = (x < 256) ? 255 - x : 0; - mdbx_tassert(txn, x < 256 && x < prio); - return prio = x; + uint32_t factor = npages | npages >> 1; + factor |= factor >> 2; + factor |= factor >> 4; + factor |= factor >> 8; + factor |= factor >> 16; + factor = prio * log2n_powerof2(factor + 1) + /* golden ratio */ 157; + factor = (factor < 256) ? 255 - factor : 0; + mdbx_tassert(txn, factor < 256 && factor < (256 - prio)); + return prio = factor; } /* Spill pages from the dirty list back to disk. @@ -8578,6 +8567,9 @@ static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, if (!MDBX_FAKE_SPILL_WRITEMAP && ctx.flush_end > ctx.flush_begin) { MDBX_env *const env = txn->mt_env; +#if MDBX_ENABLE_PGOP_STAT + safe64_inc(&env->me_lck->mti_pgop_stat.wops, 1); +#endif /* MDBX_ENABLE_PGOP_STAT */ rc = mdbx_msync(&env->me_dxb_mmap, pgno_align2os_bytes(env, ctx.flush_begin), pgno_align2os_bytes(env, ctx.flush_end - ctx.flush_begin), @@ -8646,21 +8638,20 @@ static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, * тем самым повышая их шансы на выталкивание. */ /* get min/max of LRU-labels */ - unsigned lru_min = dl->items[1].lru, lru_max = lru_min; - for (unsigned i = 2; i <= dl->length; ++i) { - lru_min = (lru_min < dl->items[i].lru) ? lru_min : dl->items[i].lru; - lru_max = (lru_max > dl->items[i].lru) ? lru_max : dl->items[i].lru; + uint32_t age_max = 0; + for (unsigned i = 1; i <= dl->length; ++i) { + const uint32_t age = mdbx_dpl_age(txn, i); + age_max = (age_max >= age) ? 
age_max : age; } - mdbx_verbose("lru-head %u, lru-min %u, lru-max %u", txn->tw.dirtylru, lru_min, - lru_max); + mdbx_verbose("lru-head %u, age-max %u", txn->tw.dirtylru, age_max); /* half of 8-bit radix-sort */ unsigned radix_counters[256], spillable = 0, spilled = 0; memset(&radix_counters, 0, sizeof(radix_counters)); - unsigned const reciprocal = 255 * 256 / (lru_max - lru_min + 1); + const uint32_t reciprocal = (UINT32_C(255) << 24) / (age_max + 1); for (unsigned i = 1; i <= dl->length; ++i) { - unsigned prio = spill_prio(txn, i, lru_min, reciprocal); + unsigned prio = spill_prio(txn, i, reciprocal); if (prio < 256) { radix_counters[prio] += 1; spillable += 1; @@ -8691,7 +8682,7 @@ static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, unsigned r, w, prio; for (w = 0, r = 1; r <= dl->length && spilled < wanna_spill; prev_prio = prio, ++r) { - prio = spill_prio(txn, r, lru_min, reciprocal); + prio = spill_prio(txn, r, reciprocal); MDBX_page *const dp = dl->items[r].ptr; if (prio < prio2adjacent) { const pgno_t pgno = dl->items[r].pgno; @@ -8700,9 +8691,9 @@ static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, if (prev_prio < prio2adjacent && prev_prio > prio2spill && dpl_endpgno(dl, r - 1) == pgno) { mdbx_debug("co-spill %u prev-adjacent page %" PRIaPGNO - " (lru-dist %d, prio %u)", + " (age %d, prio %u)", dpl_npages(dl, w), dl->items[r - 1].pgno, - txn->tw.dirtylru - dl->items[r - 1].lru, prev_prio); + mdbx_dpl_age(txn, r - 1), prev_prio); --w; rc = spill_page(txn, &ctx, dl->items[r - 1].ptr, dpl_npages(dl, r - 1)); @@ -8711,9 +8702,8 @@ static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, ++spilled; } - mdbx_debug("spill %u page %" PRIaPGNO " (lru-dist %d, prio %u)", - npages, dp->mp_pgno, txn->tw.dirtylru - dl->items[r].lru, - prio); + mdbx_debug("spill %u page %" PRIaPGNO " (age %d, prio %u)", npages, + dp->mp_pgno, mdbx_dpl_age(txn, r), prio); rc = spill_page(txn, &ctx, dp, npages); if (unlikely(rc != MDBX_SUCCESS)) break; @@ -8723,9 +8713,8 @@ static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, if (prev_prio <= prio2spill && dpl_endpgno(dl, r - 1) == pgno) { mdbx_debug("co-spill %u next-adjacent page %" PRIaPGNO - " (lru-dist %d, prio %u)", - npages, dp->mp_pgno, txn->tw.dirtylru - dl->items[r].lru, - prio); + " (age %d, prio %u)", + npages, dp->mp_pgno, mdbx_dpl_age(txn, r), prio); rc = spill_page(txn, &ctx, dp, npages); if (unlikely(rc != MDBX_SUCCESS)) break; @@ -8763,10 +8752,9 @@ static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, for (unsigned i = 1; i <= dl->length; ++i) { MDBX_page *dp = dl->items[i].ptr; mdbx_notice( - "dirtylist[%u]: pgno %u, npages %u, flags 0x%04X, lru %u, prio %u", i, - dp->mp_pgno, dpl_npages(dl, i), dp->mp_flags, - txn->tw.dirtylru - dl->items[i].lru, - spill_prio(txn, i, lru_min, reciprocal)); + "dirtylist[%u]: pgno %u, npages %u, flags 0x%04X, age %u, prio %u", i, + dp->mp_pgno, dpl_npages(dl, i), dp->mp_flags, mdbx_dpl_age(txn, i), + spill_prio(txn, i, reciprocal)); } } @@ -9157,7 +9145,7 @@ static int __must_check_result mdbx_page_dirty(MDBX_txn *txn, MDBX_page *mp, } #if !(defined(_WIN32) || defined(_WIN64)) -static __always_inline __maybe_unused int ignore_enosys(int err) { +MDBX_MAYBE_UNUSED static __always_inline int ignore_enosys(int err) { #ifdef ENOSYS if (err == ENOSYS) return MDBX_RESULT_TRUE; @@ -9313,13 +9301,6 @@ static __cold int mdbx_set_readahead(MDBX_env *env, const pgno_t edge, static __cold int mdbx_mapresize(MDBX_env *env, const pgno_t 
used_pgno, const pgno_t size_pgno, const pgno_t limit_pgno, const bool implicit) { - if ((env->me_flags & MDBX_WRITEMAP) && env->me_lck->mti_unsynced_pages.weak) { - int err = mdbx_msync(&env->me_dxb_mmap, 0, - pgno_align2os_bytes(env, used_pgno), true); - if (unlikely(err != MDBX_SUCCESS)) - return err; - } - const size_t limit_bytes = pgno_align2os_bytes(env, limit_pgno); const size_t size_bytes = pgno_align2os_bytes(env, size_pgno); const size_t prev_size = env->me_dxb_mmap.current; @@ -9337,6 +9318,8 @@ static __cold int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno, mdbx_assert(env, bytes2pgno(env, size_bytes) >= size_pgno); mdbx_assert(env, bytes2pgno(env, limit_bytes) >= limit_pgno); + unsigned mresize_flags = + env->me_flags & (MDBX_RDONLY | MDBX_WRITEMAP | MDBX_UTTERLY_NOSYNC); #if defined(_WIN32) || defined(_WIN64) /* Acquire guard in exclusive mode for: * - to avoid collision between read and write txns around env->me_dbgeo; @@ -9350,28 +9333,30 @@ static __cold int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno, size_bytes == env->me_dxb_mmap.filesize) goto bailout; - /* 1) Windows allows only extending a read-write section, but not a - * corresponding mapped view. Therefore in other cases we must suspend - * the local threads for safe remap. - * 2) At least on Windows 10 1803 the entire mapped section is unavailable - * for short time during NtExtendSection() or VirtualAlloc() execution. - * 3) Under Wine runtime environment on Linux a section extending is not - * supported. Therefore thread suspending is always required. - * - * THEREFORE LOCAL THREADS SUSPENDING IS ALWAYS REQUIRED! */ - array_onstack.limit = ARRAY_LENGTH(array_onstack.handles); - array_onstack.count = 0; - suspended = &array_onstack; - rc = mdbx_suspend_threads_before_remap(env, &suspended); - if (rc != MDBX_SUCCESS) { - mdbx_error("failed suspend-for-remap: errcode %d", rc); - goto bailout; + if ((env->me_flags & MDBX_NOTLS) == 0) { + /* 1) Windows allows only extending a read-write section, but not a + * corresponding mapped view. Therefore in other cases we must suspend + * the local threads for safe remap. + * 2) At least on Windows 10 1803 the entire mapped section is unavailable + * for short time during NtExtendSection() or VirtualAlloc() execution. + * 3) Under Wine runtime environment on Linux a section extending is not + * supported. + * + * THEREFORE LOCAL THREADS SUSPENDING IS ALWAYS REQUIRED! */ + array_onstack.limit = ARRAY_LENGTH(array_onstack.handles); + array_onstack.count = 0; + suspended = &array_onstack; + rc = mdbx_suspend_threads_before_remap(env, &suspended); + if (rc != MDBX_SUCCESS) { + mdbx_error("failed suspend-for-remap: errcode %d", rc); + goto bailout; + } + mresize_flags |= implicit ? 
MDBX_MRESIZE_MAY_UNMAP + : MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE; } - const bool mapping_can_be_moved = !implicit; -#else /* Windows */ +#else /* Windows */ /* Acquire guard to avoid collision between read and write txns * around env->me_dbgeo */ - bool mapping_can_be_moved = false; int rc = mdbx_fastmutex_acquire(&env->me_remap_guard); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -9380,7 +9365,8 @@ static __cold int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno, goto bailout; MDBX_lockinfo *const lck = env->me_lck_mmap.lck; - if (limit_bytes != env->me_dxb_mmap.limit && lck && !implicit) { + if (limit_bytes != env->me_dxb_mmap.limit && !(env->me_flags & MDBX_NOTLS) && + lck && !implicit) { int err = mdbx_rdt_lock(env) /* lock readers table until remap done */; if (unlikely(MDBX_IS_ERROR(err))) { rc = err; @@ -9390,21 +9376,31 @@ static __cold int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno, /* looking for readers from this process */ const unsigned snap_nreaders = atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); - mapping_can_be_moved = true; + mresize_flags |= implicit ? MDBX_MRESIZE_MAY_UNMAP + : MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE; for (unsigned i = 0; i < snap_nreaders; ++i) { if (lck->mti_readers[i].mr_pid.weak == env->me_pid && lck->mti_readers[i].mr_tid.weak != mdbx_thread_self()) { /* the base address of the mapping can't be changed since * the other reader thread from this process exists. */ mdbx_rdt_unlock(env); - mapping_can_be_moved = false; + mresize_flags &= ~(MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE); break; } } } - #endif /* ! Windows */ + if ((env->me_flags & MDBX_WRITEMAP) && env->me_lck->mti_unsynced_pages.weak) { +#if MDBX_ENABLE_PGOP_STAT + safe64_inc(&env->me_lck->mti_pgop_stat.wops, 1); +#endif /* MDBX_ENABLE_PGOP_STAT */ + rc = mdbx_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, used_pgno), + MDBX_SYNC_NONE); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } + #if MDBX_ENABLE_MADVISE if (size_bytes < prev_size) { mdbx_notice("resize-MADV_%s %u..%u", @@ -9442,8 +9438,7 @@ static __cold int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno, } #endif /* MDBX_ENABLE_MADVISE */ - rc = mdbx_mresize(env->me_flags, &env->me_dxb_mmap, size_bytes, limit_bytes, - mapping_can_be_moved); + rc = mdbx_mresize(mresize_flags, &env->me_dxb_mmap, size_bytes, limit_bytes); #if MDBX_ENABLE_MADVISE if (rc == MDBX_SUCCESS) { @@ -9478,7 +9473,7 @@ static __cold int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno, } #endif /* MDBX_USE_VALGRIND */ } else { - if (rc != MDBX_UNABLE_EXTEND_MAPSIZE) { + if (rc != MDBX_UNABLE_EXTEND_MAPSIZE && rc != MDBX_RESULT_TRUE) { mdbx_error("failed resize datafile/mapping: " "present %" PRIuPTR " -> %" PRIuPTR ", " "limit %" PRIuPTR " -> %" PRIuPTR ", errcode %d", @@ -9506,7 +9501,8 @@ static __cold int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno, mdbx_free(suspended); } #else - if (env->me_lck_mmap.lck && mapping_can_be_moved) + if (env->me_lck_mmap.lck && + (mresize_flags & (MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE)) != 0) mdbx_rdt_unlock(env); int err = mdbx_fastmutex_release(&env->me_remap_guard); #endif /* Windows */ @@ -9533,7 +9529,7 @@ static __cold int mdbx_mapresize_implicit(MDBX_env *env, const pgno_t used_pgno, } static int mdbx_meta_unsteady(MDBX_env *env, const txnid_t last_steady, - MDBX_meta *const meta) { + MDBX_meta *const meta, mdbx_filehandle_t fd) { const uint64_t wipe = MDBX_DATASIGN_NONE; if (unlikely(META_IS_STEADY(meta)) && 
mdbx_meta_txnid_stable(env, meta) <= last_steady) { @@ -9541,62 +9537,59 @@ static int mdbx_meta_unsteady(MDBX_env *env, const txnid_t last_steady, data_page(meta)->mp_pgno); if (env->me_flags & MDBX_WRITEMAP) unaligned_poke_u64(4, meta->mm_datasync_sign, wipe); - else { -#if MDBX_ENABLE_PGOP_STAT - safe64_inc(&env->me_lck->mti_pgop_stat.wops, 1); -#endif /* MDBX_ENABLE_PGOP_STAT */ - return mdbx_pwrite(env->me_lazy_fd, &wipe, sizeof(meta->mm_datasync_sign), + else + return mdbx_pwrite(fd, &wipe, sizeof(meta->mm_datasync_sign), (uint8_t *)&meta->mm_datasync_sign - env->me_map); - } } return MDBX_SUCCESS; } __cold static int mdbx_wipe_steady(MDBX_env *env, const txnid_t last_steady) { - int err = mdbx_meta_unsteady(env, last_steady, METAPAGE(env, 0)); +#if MDBX_ENABLE_PGOP_STAT + safe64_inc(&env->me_lck->mti_pgop_stat.wops, 1); +#endif /* MDBX_ENABLE_PGOP_STAT */ + const mdbx_filehandle_t fd = (env->me_dsync_fd != INVALID_HANDLE_VALUE) + ? env->me_dsync_fd + : env->me_lazy_fd; + int err = mdbx_meta_unsteady(env, last_steady, METAPAGE(env, 0), fd); if (unlikely(err != MDBX_SUCCESS)) return err; - err = mdbx_meta_unsteady(env, last_steady, METAPAGE(env, 1)); + err = mdbx_meta_unsteady(env, last_steady, METAPAGE(env, 1), fd); if (unlikely(err != MDBX_SUCCESS)) return err; - err = mdbx_meta_unsteady(env, last_steady, METAPAGE(env, 2)); + err = mdbx_meta_unsteady(env, last_steady, METAPAGE(env, 2), fd); if (unlikely(err != MDBX_SUCCESS)) return err; if (env->me_flags & MDBX_WRITEMAP) { mdbx_flush_incoherent_cpu_writeback(); err = mdbx_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), - false); + MDBX_SYNC_DATA); if (unlikely(err != MDBX_SUCCESS)) return err; } else { + if (fd == env->me_lazy_fd) { #if MDBX_USE_SYNCFILERANGE - static bool syncfilerange_unavailable; - if (likely(!syncfilerange_unavailable)) { - if (likely(!sync_file_range( - env->me_lazy_fd, 0, pgno2bytes(env, NUM_METAS), - SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER))) - goto done_filesync; - err = errno; - if (ignore_enosys(err) != MDBX_RESULT_TRUE) + static bool syncfilerange_unavailable; + if (!syncfilerange_unavailable && + sync_file_range(env->me_lazy_fd, 0, pgno2bytes(env, NUM_METAS), + SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER)) { + err = errno; + if (ignore_enosys(err) == MDBX_RESULT_TRUE) + syncfilerange_unavailable = true; + } + if (syncfilerange_unavailable) +#endif /* MDBX_USE_SYNCFILERANGE */ + err = mdbx_fsync(env->me_lazy_fd, MDBX_SYNC_DATA); + if (unlikely(err != MDBX_SUCCESS)) return err; - syncfilerange_unavailable = true; } -#endif /* MDBX_USE_SYNCFILERANGE */ - err = mdbx_fsync(env->me_lazy_fd, MDBX_SYNC_DATA); - if (unlikely(err != MDBX_SUCCESS)) - return err; -#if MDBX_USE_SYNCFILERANGE - done_filesync: -#endif /* MDBX_USE_SYNCFILERANGE */ mdbx_flush_incoherent_mmap(env->me_map, pgno2bytes(env, NUM_METAS), env->me_os_psize); } - MDBX_lockinfo *const lck = env->me_lck_mmap.lck; - if (likely(lck)) - /* force oldest refresh */ - atomic_store32(&lck->mti_readers_refresh_flag, true, mo_Relaxed); + /* force oldest refresh */ + atomic_store32(&env->me_lck->mti_readers_refresh_flag, true, mo_Relaxed); return MDBX_SUCCESS; } @@ -10113,7 +10106,8 @@ __hot static struct page_result mdbx_page_alloc(MDBX_cursor *mc, } /* Copy the used portions of a non-overflow page. 
*/ -__hot static void mdbx_page_copy(MDBX_page *dst, MDBX_page *src, size_t psize) { +__hot static void mdbx_page_copy(MDBX_page *dst, const MDBX_page *src, + size_t psize) { STATIC_ASSERT(UINT16_MAX > MAX_PAGESIZE - PAGEHDRSZ); STATIC_ASSERT(MIN_PAGESIZE > PAGEHDRSZ + NODESIZE * 4); if ((src->mp_flags & (P_LEAF2 | P_OVERFLOW)) == 0) { @@ -10138,7 +10132,7 @@ __hot static void mdbx_page_copy(MDBX_page *dst, MDBX_page *src, size_t psize) { * If a page being referenced was spilled to disk in this txn, bring * it back and make it dirty/writable again. */ static struct page_result __must_check_result -mdbx_page_unspill(MDBX_txn *const txn, MDBX_page *mp) { +mdbx_page_unspill(MDBX_txn *const txn, const MDBX_page *const mp) { mdbx_verbose("unspill page %" PRIaPGNO, mp->mp_pgno); mdbx_tassert(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0); mdbx_tassert(txn, IS_SPILLED(txn, mp)); @@ -10176,8 +10170,13 @@ mdbx_page_unspill(MDBX_txn *const txn, MDBX_page *mp) { ret.page->mp_flags |= (scan == txn) ? 0 : P_SPILLED; ret.err = MDBX_SUCCESS; return ret; - } while ((scan = scan->mt_parent) != nullptr && - (scan->mt_flags & MDBX_TXN_SPILLS) != 0); + } while (likely((scan = scan->mt_parent) != nullptr && + (scan->mt_flags & MDBX_TXN_SPILLS) != 0)); + mdbx_error("Page %" PRIaPGNO " mod-txnid %" PRIaTXN + " not found in the spill-list(s), current txn %" PRIaTXN + " front %" PRIaTXN ", root txn %" PRIaTXN " front %" PRIaTXN, + mp->mp_pgno, mp->mp_txnid, txn->mt_txnid, txn->mt_front, + txn->mt_env->me_txn0->mt_txnid, txn->mt_env->me_txn0->mt_front); ret.err = MDBX_PROBLEM; ret.page = NULL; return ret; @@ -10190,7 +10189,8 @@ mdbx_page_unspill(MDBX_txn *const txn, MDBX_page *mp) { * * Returns 0 on success, non-zero on failure. */ __hot static int mdbx_page_touch(MDBX_cursor *mc) { - MDBX_page *const mp = mc->mc_pg[mc->mc_top], *np; + const MDBX_page *const mp = mc->mc_pg[mc->mc_top]; + MDBX_page *np; MDBX_txn *txn = mc->mc_txn; int rc; @@ -10243,7 +10243,7 @@ __hot static int mdbx_page_touch(MDBX_cursor *mc) { mdbx_page_copy(np, mp, txn->mt_env->me_psize); np->mp_pgno = pgno; np->mp_txnid = txn->mt_front; - } else if (!IS_SHADOWED(txn, mp)) { + } else if (IS_SPILLED(txn, mp)) { struct page_result pur = mdbx_page_unspill(txn, mp); np = pur.page; rc = pur.err; @@ -10254,6 +10254,12 @@ __hot static int mdbx_page_touch(MDBX_cursor *mc) { goto fail; } else { if (unlikely(!txn->mt_parent)) { + mdbx_error("Unexpected not frozen/modifiable/spilled but shadowed %s " + "page %" PRIaPGNO " mod-txnid %" PRIaTXN "," + " without parent transaction, current txn %" PRIaTXN + " front %" PRIaTXN, + IS_BRANCH(mp) ? "branch" : "leaf", mp->mp_pgno, mp->mp_txnid, + mc->mc_txn->mt_txnid, mc->mc_txn->mt_front); rc = MDBX_PROBLEM; goto fail; } @@ -10352,12 +10358,16 @@ __cold static int mdbx_env_sync_internal(MDBX_env *env, bool force, mdbx_assert(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); const size_t usedbytes = pgno_align2os_bytes(env, head->mm_geo.next); +#if MDBX_ENABLE_PGOP_STAT + safe64_inc(&env->me_lck->mti_pgop_stat.wops, 1); +#endif /* MDBX_ENABLE_PGOP_STAT */ mdbx_txn_unlock(env); /* LY: pre-sync without holding lock to reduce latency for writer(s) */ - int err = (flags & MDBX_WRITEMAP) - ? mdbx_msync(&env->me_dxb_mmap, 0, usedbytes, false) - : mdbx_fsync(env->me_lazy_fd, MDBX_SYNC_DATA); + int err = + (flags & MDBX_WRITEMAP) + ? 
mdbx_msync(&env->me_dxb_mmap, 0, usedbytes, MDBX_SYNC_DATA) + : mdbx_fsync(env->me_lazy_fd, MDBX_SYNC_DATA); if (unlikely(err != MDBX_SUCCESS)) return err; @@ -10398,9 +10408,13 @@ __cold static int mdbx_env_sync_internal(MDBX_env *env, bool force, const txnid_t head_txnid = mdbx_recent_committed_txnid(env); if (atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed) != (uint32_t)head_txnid) { +#if MDBX_ENABLE_PGOP_STAT + safe64_inc(&env->me_lck->mti_pgop_stat.wops, 1); +#endif /* MDBX_ENABLE_PGOP_STAT */ rc = (flags & MDBX_WRITEMAP) ? mdbx_msync(&env->me_dxb_mmap, 0, - pgno_align2os_bytes(env, NUM_METAS), false) + pgno_align2os_bytes(env, NUM_METAS), + MDBX_SYNC_DATA | MDBX_SYNC_IODQ) : mdbx_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); if (likely(rc == MDBX_SUCCESS)) atomic_store32(&env->me_lck->mti_meta_sync_txnid, (uint32_t)head_txnid, @@ -10995,7 +11009,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { if (unlikely(rc != MDBX_SUCCESS)) goto bailout; txn->tw.dirtyroom = txn->mt_env->me_options.dp_limit; - txn->tw.dirtylru = 0; + txn->tw.dirtylru = MDBX_DEBUG ? ~42u : 0; } /* Setup db info */ @@ -11033,12 +11047,13 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { } if (txn->mt_flags & MDBX_TXN_RDONLY) { #if defined(_WIN32) || defined(_WIN64) - if ((size > env->me_dbgeo.lower && env->me_dbgeo.shrink) || - (mdbx_RunningUnderWine() && - /* under Wine acquisition of remap_guard is always required, - * since Wine don't support section extending, - * i.e. in both cases unmap+map are required. */ - size < env->me_dbgeo.upper && env->me_dbgeo.grow)) { + if (((size > env->me_dbgeo.lower && env->me_dbgeo.shrink) || + (mdbx_RunningUnderWine() && + /* under Wine acquisition of remap_guard is always required, + * since Wine don't support section extending, + * i.e. in both cases unmap+map are required. */ + size < env->me_dbgeo.upper && env->me_dbgeo.grow)) && + /* avoid recursive use SRW */ (txn->mt_flags & MDBX_NOTLS) == 0) { txn->mt_flags |= MDBX_SHRINK_ALLOWED; mdbx_srwlock_AcquireShared(&env->me_remap_guard); } @@ -11761,7 +11776,6 @@ static int mdbx_txn_end(MDBX_txn *txn, const unsigned mode) { parent->mt_child = nullptr; parent->mt_flags &= ~MDBX_TXN_HAS_CHILD; - mdbx_tassert(parent, parent->tw.dirtylru <= txn->tw.dirtylru); parent->tw.dirtylru = txn->tw.dirtylru; mdbx_tassert(parent, mdbx_dirtylist_check(parent)); mdbx_tassert(parent, mdbx_audit_ex(parent, 0, false) == 0); @@ -12577,7 +12591,8 @@ static int mdbx_update_gc(MDBX_txn *txn) { reservation_gc_id < 1 || reservation_gc_id >= atomic_load64(&env->me_lck->mti_oldest_reader, mo_Relaxed))) { - mdbx_error("%s", "** internal error (reservation_gc_id)"); + mdbx_error("** internal error (reservation_gc_id %" PRIaTXN ")", + reservation_gc_id); rc = MDBX_PROBLEM; goto bailout; } @@ -13156,7 +13171,6 @@ static __inline void mdbx_txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, parent->tw.dirtyroom -= dst->sorted - dst->length; assert(parent->tw.dirtyroom <= parent->mt_env->me_options.dp_limit); dpl_setlen(dst, dst->sorted); - mdbx_tassert(parent, parent->tw.dirtylru <= txn->tw.dirtylru); parent->tw.dirtylru = txn->tw.dirtylru; mdbx_tassert(parent, mdbx_dirtylist_check(parent)); mdbx_dpl_free(txn); @@ -13692,8 +13706,8 @@ mdbx_validate_meta(MDBX_env *env, MDBX_meta *const meta, uint64_t *filesize, /* Read the environment parameters of a DB environment * before mapping it into memory. 
*/ static __cold int mdbx_read_header(MDBX_env *env, MDBX_meta *dest, - uint64_t *filesize, - const int lck_exclusive) { + uint64_t *filesize, const int lck_exclusive, + const mdbx_mode_t mode_bits) { int rc = mdbx_filesize(env->me_lazy_fd, filesize); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -13723,9 +13737,9 @@ static __cold int mdbx_read_header(MDBX_env *env, MDBX_meta *dest, int err = mdbx_pread(env->me_lazy_fd, buffer, MIN_PAGESIZE, offset); if (err != MDBX_SUCCESS) { if (err == MDBX_ENODATA && offset == 0 && loop_count == 0 && - *filesize == 0 && (env->me_flags & MDBX_RDONLY) == 0) - mdbx_warning("read meta: empty file (%d, %s)", err, - mdbx_strerror(err)); + *filesize == 0 && mode_bits /* non-zero for DB creation */ != 0) + mdbx_notice("read meta: empty file (%d, %s)", err, + mdbx_strerror(err)); else mdbx_error("read meta[%u,%u]: %i, %s", offset, MIN_PAGESIZE, err, mdbx_strerror(err)); @@ -14007,6 +14021,9 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, if (flags & MDBX_NOMETASYNC) mode_bits |= MDBX_SYNC_IODQ; } +#if MDBX_ENABLE_PGOP_STAT + safe64_inc(&env->me_lck->mti_pgop_stat.wops, 1); +#endif /* MDBX_ENABLE_PGOP_STAT */ if (flags & MDBX_WRITEMAP) rc = mdbx_msync(&env->me_dxb_mmap, 0, @@ -14090,6 +14107,9 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, mdbx_ensure(env, target == head || mdbx_meta_txnid_stable(env, target) < unaligned_peek_u64(4, pending->mm_txnid_a)); +#if MDBX_ENABLE_PGOP_STAT + safe64_inc(&env->me_lck->mti_pgop_stat.wops, 1); +#endif /* MDBX_ENABLE_PGOP_STAT */ if (flags & MDBX_WRITEMAP) { mdbx_jitter4testing(true); if (likely(target != head)) { @@ -14762,11 +14782,12 @@ __cold int mdbx_env_get_maxreaders(const MDBX_env *env, unsigned *readers) { #endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */ /* Further setup required for opening an MDBX environment */ -static __cold int mdbx_setup_dxb(MDBX_env *env, const int lck_rc) { +static __cold int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, + const mdbx_mode_t mode_bits) { uint64_t filesize_before; MDBX_meta meta; int rc = MDBX_RESULT_FALSE; - int err = mdbx_read_header(env, &meta, &filesize_before, lck_rc); + int err = mdbx_read_header(env, &meta, &filesize_before, lck_rc, mode_bits); if (unlikely(err != MDBX_SUCCESS)) { if (lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE || err != MDBX_ENODATA || (env->me_flags & MDBX_RDONLY) != 0 || @@ -14798,7 +14819,7 @@ static __cold int mdbx_setup_dxb(MDBX_env *env, const int lck_rc) { return err; #ifndef NDEBUG /* just for checking */ - err = mdbx_read_header(env, &meta, &filesize_before, lck_rc); + err = mdbx_read_header(env, &meta, &filesize_before, lck_rc, mode_bits); if (unlikely(err != MDBX_SUCCESS)) return err; #endif @@ -14982,18 +15003,18 @@ static __cold int mdbx_setup_dxb(MDBX_env *env, const int lck_rc) { env->me_poison_edge = bytes2pgno(env, env->me_dxb_mmap.limit); #endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */ - const unsigned meta_clash_mask = mdbx_meta_eq_mask(env); - if (unlikely(meta_clash_mask)) { - if (/* not recovery mode */ env->me_stuck_meta < 0) { - mdbx_error("meta-pages are clashed: mask 0x%d", meta_clash_mask); - return MDBX_CORRUPTED; - } else { - mdbx_warning("ignore meta-pages clashing (mask 0x%d) in recovery mode", - meta_clash_mask); + while (likely(/* not recovery mode */ env->me_stuck_meta < 0)) { + const unsigned meta_clash_mask = mdbx_meta_eq_mask(env); + if (unlikely(meta_clash_mask)) { + if (/* not recovery mode */ env->me_stuck_meta < 0) { + mdbx_error("meta-pages are clashed: mask 0x%d", 
meta_clash_mask); + return MDBX_CORRUPTED; + } else { + mdbx_warning("ignore meta-pages clashing (mask 0x%d) in recovery mode", + meta_clash_mask); + } } - } - while (likely(/* not recovery mode */ env->me_stuck_meta < 0)) { MDBX_meta *const head = mdbx_meta_head(env); const txnid_t head_txnid = mdbx_meta_txnid_fluid(env, head); MDBX_meta *const steady = mdbx_meta_steady(env); @@ -15057,6 +15078,9 @@ static __cold int mdbx_setup_dxb(MDBX_env *env, const int lck_rc) { head_txnid, steady_txnid, undo_txnid); mdbx_ensure(env, head_txnid == mdbx_meta_txnid_stable(env, head)); +#if MDBX_ENABLE_PGOP_STAT + safe64_inc(&env->me_lck->mti_pgop_stat.wops, 1); +#endif /* MDBX_ENABLE_PGOP_STAT */ if (env->me_flags & MDBX_WRITEMAP) { /* It is possible to update txnid without safe64_write(), * since DB opened exclusive for now */ @@ -15067,13 +15091,19 @@ static __cold int mdbx_setup_dxb(MDBX_env *env, const int lck_rc) { const size_t paged_offset = floor_powerof2(offset, env->me_os_psize); const size_t paged_length = ceil_powerof2( env->me_psize + offset - paged_offset, env->me_os_psize); - err = mdbx_msync(&env->me_dxb_mmap, paged_offset, paged_length, false); + err = mdbx_msync(&env->me_dxb_mmap, paged_offset, paged_length, + MDBX_SYNC_DATA | MDBX_SYNC_IODQ); } else { MDBX_meta rollback = *head; mdbx_meta_set_txnid(env, &rollback, undo_txnid); unaligned_poke_u64(4, rollback.mm_datasync_sign, MDBX_DATASIGN_WEAK); - err = mdbx_pwrite(env->me_lazy_fd, &rollback, sizeof(MDBX_meta), + const mdbx_filehandle_t fd = (env->me_dsync_fd != INVALID_HANDLE_VALUE) + ? env->me_dsync_fd + : env->me_lazy_fd; + err = mdbx_pwrite(fd, &rollback, sizeof(MDBX_meta), (uint8_t *)head - (uint8_t *)env->me_map); + if (err == MDBX_SUCCESS && fd == env->me_lazy_fd) + err = mdbx_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); } if (err) { mdbx_error("error %d rollback from %" PRIaTXN ", to %" PRIaTXN @@ -15375,7 +15405,10 @@ static __cold int mdbx_setup_lck(MDBX_env *env, char *lck_pathname, mdbx_jitter4testing(false); lck->mti_magic_and_version = MDBX_LOCK_MAGIC; lck->mti_os_and_format = MDBX_LOCK_FORMAT; - err = mdbx_msync(&env->me_lck_mmap, 0, (size_t)size, false); +#if MDBX_ENABLE_PGOP_STAT + lck->mti_pgop_stat.wops.weak = 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + err = mdbx_msync(&env->me_lck_mmap, 0, (size_t)size, MDBX_SYNC_NONE); if (unlikely(err != MDBX_SUCCESS)) { mdbx_error("initial-%s for lck-file failed", "msync"); goto bailout; @@ -15511,6 +15544,9 @@ __cold int mdbx_env_turn_for_recovery(MDBX_env *env, unsigned target_meta) { unaligned_poke_u64(4, meta->mm_datasync_sign, mdbx_meta_sign(meta)); } +#if MDBX_ENABLE_PGOP_STAT + safe64_inc(&env->me_lck->mti_pgop_stat.wops, 1); +#endif /* MDBX_ENABLE_PGOP_STAT */ if (env->me_flags & MDBX_WRITEMAP) { mdbx_flush_incoherent_cpu_writeback(); rc = mdbx_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, target_meta), @@ -15519,9 +15555,6 @@ __cold int mdbx_env_turn_for_recovery(MDBX_env *env, unsigned target_meta) { const mdbx_filehandle_t fd = (env->me_dsync_fd != INVALID_HANDLE_VALUE) ? 
env->me_dsync_fd : env->me_lazy_fd; -#if MDBX_ENABLE_PGOP_STAT - safe64_inc(&env->me_lck->mti_pgop_stat.wops, 1); -#endif /* MDBX_ENABLE_PGOP_STAT */ rc = mdbx_pwrite(fd, page, env->me_psize, pgno2bytes(env, target_meta)); if (rc == MDBX_SUCCESS && fd == env->me_lazy_fd) rc = mdbx_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); @@ -15741,9 +15774,12 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname, if (unlikely(rc != MDBX_SUCCESS)) return rc; - if (flags & ~ENV_USABLE_FLAGS) + if (unlikely(flags & ~ENV_USABLE_FLAGS)) return MDBX_EINVAL; + if (flags & MDBX_RDONLY) + mode = 0; + if (env->me_lazy_fd != INVALID_HANDLE_VALUE || (env->me_flags & MDBX_ENV_ACTIVE) != 0 || env->me_map) return MDBX_EPERM; @@ -15891,7 +15927,7 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname, } } - const int dxb_rc = mdbx_setup_dxb(env, lck_rc); + const int dxb_rc = mdbx_setup_dxb(env, lck_rc, mode); if (MDBX_IS_ERROR(dxb_rc)) { rc = dxb_rc; goto bailout; @@ -18103,8 +18139,16 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, if (unlikely(pgr.err)) return pgr.err; } else { - if (unlikely(!mc->mc_txn->mt_parent)) + if (unlikely(!mc->mc_txn->mt_parent)) { + mdbx_error( + "Unexpected not frozen/modifiable/spilled but shadowed %s " + "page %" PRIaPGNO " mod-txnid %" PRIaTXN "," + " without parent transaction, current txn %" PRIaTXN + " front %" PRIaTXN, + "overflow/large", pgno, pgr.page->mp_txnid, + mc->mc_txn->mt_txnid, mc->mc_txn->mt_front); return MDBX_PROBLEM; + } /* It is writable only in a parent txn */ MDBX_page *np = mdbx_page_malloc(mc->mc_txn, ovpages); @@ -18464,8 +18508,8 @@ new_sub:; * make sure the cursor is marked valid. */ mc->mc_flags |= C_INITIALIZED; } - if (flags & MDBX_MULTIPLE) { - if (!rc) { + if (unlikely(flags & MDBX_MULTIPLE)) { + if (likely(rc == MDBX_SUCCESS)) { continue_multiple: mcount++; /* let caller know how many succeeded, if any */ @@ -18481,10 +18525,11 @@ new_sub:; rc = mdbx_cursor_check(mc, 0); return rc; bad_sub: - if (unlikely(rc == MDBX_KEYEXIST)) - mdbx_error("unexpected %s", "MDBX_KEYEXIST"); - /* should not happen, we deleted that item */ - rc = MDBX_PROBLEM; + if (unlikely(rc == MDBX_KEYEXIST)) { + /* should not happen, we deleted that item */ + mdbx_error("Unexpected %i error while put to nested dupsort's hive", rc); + rc = MDBX_PROBLEM; + } } mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; return rc; @@ -18760,10 +18805,16 @@ static int __must_check_result mdbx_node_add_leaf(MDBX_cursor *mc, } else if (unlikely(node_size(key, data) > mc->mc_txn->mt_env->me_leaf_nodemax)) { /* Put data on overflow page. 
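 * (A node whose key+data size exceeds me_leaf_nodemax stores its data
 * out-of-line on a run of P_OVERFLOW pages, keeping only the number of
 * the first such page in the leaf; this is why the dupsort and sub-data
 * cases are rejected just below.)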
*/
-  mdbx_ensure(mc->mc_txn->mt_env,
-              !F_ISSET(mc->mc_db->md_flags, MDBX_DUPSORT));
-  if (unlikely(flags & (F_DUPDATA | F_SUBDATA)))
+    if (unlikely(mc->mc_db->md_flags & MDBX_DUPSORT)) {
+      mdbx_error("Unexpected target %s flags 0x%x for large data-item",
+                 "dupsort-db", mc->mc_db->md_flags);
+      return MDBX_PROBLEM;
+    }
+    if (unlikely(flags & (F_DUPDATA | F_SUBDATA))) {
+      mdbx_error("Unexpected target %s flags 0x%x for large data-item", "node",
+                 flags);
       return MDBX_PROBLEM;
+    }
     const pgno_t ovpages = number_of_ovpages(mc->mc_txn->mt_env, data->iov_len);
     const struct page_result npr = mdbx_page_new(mc, P_OVERFLOW, ovpages);
     if (unlikely(npr.err != MDBX_SUCCESS))
@@ -19236,8 +19287,13 @@ int mdbx_cursor_bind(MDBX_txn *txn, MDBX_cursor *mc, MDBX_dbi dbi) {
   }
 
   if (mc->mc_signature == MDBX_MC_LIVE) {
-    if (unlikely(!mc->mc_txn || mc->mc_txn->mt_signature != MDBX_MT_SIGNATURE))
+    if (unlikely(!mc->mc_txn ||
+                 mc->mc_txn->mt_signature != MDBX_MT_SIGNATURE)) {
+      mdbx_error("Wrong cursor's transaction %p 0x%x",
+                 __Wpedantic_format_voidptr(mc->mc_txn),
+                 mc->mc_txn ? mc->mc_txn->mt_signature : 0);
       return MDBX_PROBLEM;
+    }
     if (mc->mc_flags & C_UNTRACK) {
       mdbx_cassert(mc, !(mc->mc_txn->mt_flags & MDBX_TXN_RDONLY));
       MDBX_cursor **prev = &mc->mc_txn->tw.cursors[mc->mc_dbi];
@@ -19498,6 +19554,8 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) {
   mdbx_cassert(csrc, csrc->mc_top == cdst->mc_top);
   if (unlikely(PAGETYPE(psrc) != PAGETYPE(pdst))) {
   bailout:
+    mdbx_error("Wrong or mismatched page types (src %d, dst %d) to move node",
+               PAGETYPE(psrc), PAGETYPE(pdst));
     csrc->mc_txn->mt_flags |= MDBX_TXN_ERROR;
     return MDBX_PROBLEM;
   }
@@ -20312,6 +20370,11 @@ static int mdbx_rebalance(MDBX_cursor *mc) {
       room_threshold = 0;
       goto retry;
     }
+    mdbx_error("Unable to merge/rebalance %s page %" PRIaPGNO
+               " (has %u keys, full %.1f%%, used %u, room %u bytes)",
+               (pagetype & P_LEAF) ? "leaf" : "branch", tp->mp_pgno, numkeys,
+               page_fill(mc->mc_txn->mt_env, tp),
+               page_used(mc->mc_txn->mt_env, tp), room);
     return MDBX_PROBLEM;
   }
@@ -21191,7 +21254,7 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey,
       rc = mdbx_cursor_sibling(mc, SIBLING_LEFT);
       if (unlikely(rc != MDBX_SUCCESS)) {
         if (rc == MDBX_NOTFOUND) /* improper mdbx_cursor_sibling() result */ {
-          mdbx_error("unexpected %s", "MDBX_NOTFOUND");
+          mdbx_error("unexpected %i error while going to left sibling", rc);
           rc = MDBX_PROBLEM;
         }
         goto done;
@@ -21200,9 +21263,9 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey,
     }
   } else if (unlikely(pure_left)) {
     MDBX_page *ptop_page = mc->mc_pg[ptop];
-    mdbx_notice("adding to parent page %u node[%u] left-leaf page #%u key %s",
-                ptop_page->mp_pgno, mc->mc_ki[ptop], sister->mp_pgno,
-                DKEY(mc->mc_ki[ptop] ? newkey : NULL));
+    mdbx_debug("adding to parent page %u node[%u] left-leaf page #%u key %s",
+               ptop_page->mp_pgno, mc->mc_ki[ptop], sister->mp_pgno,
+               DKEY(mc->mc_ki[ptop] ? newkey : NULL));
     mc->mc_top--;
     rc = mdbx_node_add_branch(mc, mc->mc_ki[ptop],
                               mc->mc_ki[ptop] ?
newkey : NULL, sister->mp_pgno); @@ -21210,7 +21273,7 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, newindx == mc->mc_ki[ptop + 1] && ptop == mc->mc_top); if (likely(rc == MDBX_SUCCESS) && mc->mc_ki[ptop] == 0) { - mdbx_notice("update prev-first key on parent %s", DKEY(&sepkey)); + mdbx_debug("update prev-first key on parent %s", DKEY(&sepkey)); MDBX_node *node = page_node(mc->mc_pg[ptop], 1); mdbx_cassert(mc, node_ks(node) == 0 && node_pgno(node) == mp->mp_pgno); mdbx_cassert(mc, mc->mc_top == ptop && mc->mc_ki[ptop] == 0); @@ -21269,9 +21332,9 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, &sepkey); if (mc->mc_dbx->md_cmp(newkey, &sepkey) < 0) { mc->mc_top -= i; - mdbx_notice("update new-first on parent [%i] page %u key %s", - mc->mc_ki[mc->mc_top], mc->mc_pg[mc->mc_top]->mp_pgno, - DKEY(newkey)); + mdbx_debug("update new-first on parent [%i] page %u key %s", + mc->mc_ki[mc->mc_top], mc->mc_pg[mc->mc_top]->mp_pgno, + DKEY(newkey)); rc = mdbx_update_key(mc, newkey); mc->mc_top += i; if (unlikely(rc != MDBX_SUCCESS)) @@ -22596,20 +22659,19 @@ __cold int mdbx_env_info_ex(const MDBX_env *env, const MDBX_txn *txn, if (likely(bytes > size_before_bootid)) { arg->mi_unsync_volume = pgno2bytes(env, unsynced_pages); const uint64_t monotime_now = mdbx_osal_monotime(); - arg->mi_since_sync_seconds16dot16 = mdbx_osal_monotime_to_16dot16( - monotime_now - atomic_load64(&lck->mti_sync_timestamp, mo_Relaxed)); + uint64_t ts = atomic_load64(&lck->mti_sync_timestamp, mo_Relaxed); + arg->mi_since_sync_seconds16dot16 = + ts ? mdbx_osal_monotime_to_16dot16(monotime_now - ts) : 0; + ts = atomic_load64(&lck->mti_reader_check_timestamp, mo_Relaxed); arg->mi_since_reader_check_seconds16dot16 = - lck ? mdbx_osal_monotime_to_16dot16( - monotime_now - - atomic_load64(&lck->mti_reader_check_timestamp, mo_Relaxed)) - : 0; + ts ? mdbx_osal_monotime_to_16dot16(monotime_now - ts) : 0; arg->mi_autosync_threshold = pgno2bytes( env, atomic_load32(&lck->mti_autosync_threshold, mo_Relaxed)); arg->mi_autosync_period_seconds16dot16 = mdbx_osal_monotime_to_16dot16( atomic_load64(&lck->mti_autosync_period, mo_Relaxed)); arg->mi_bootid.current.x = bootid.x; arg->mi_bootid.current.y = bootid.y; - arg->mi_mode = lck ? lck->mti_envmode.weak : env->me_flags; + arg->mi_mode = env->me_lck_mmap.lck ? lck->mti_envmode.weak : env->me_flags; } if (likely(bytes > size_before_pgop_stat)) { @@ -24010,17 +24072,11 @@ int mdbx_cursor_eof(const MDBX_cursor *mc) { return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL : MDBX_EBADSIGN; - if ((mc->mc_flags & C_INITIALIZED) == 0) - return MDBX_RESULT_TRUE; - - if (mc->mc_snum == 0) - return MDBX_RESULT_TRUE; - - if ((mc->mc_flags & C_EOF) && - mc->mc_ki[mc->mc_top] >= page_numkeys(mc->mc_pg[mc->mc_top])) - return MDBX_RESULT_TRUE; - - return MDBX_RESULT_FALSE; + return ((mc->mc_flags & (C_INITIALIZED | C_EOF)) == C_INITIALIZED && + mc->mc_snum && + mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top])) + ? 
MDBX_RESULT_FALSE + : MDBX_RESULT_TRUE; } //------------------------------------------------------------------------------ @@ -24061,8 +24117,10 @@ __hot static int cursor_diff(const MDBX_cursor *const __restrict x, return MDBX_ENODATA; while (likely(r->level < y->mc_snum && r->level < x->mc_snum)) { - if (unlikely(y->mc_pg[r->level] != x->mc_pg[r->level])) + if (unlikely(y->mc_pg[r->level] != x->mc_pg[r->level])) { + mdbx_error("Mismatch cursors's pages at %u level", r->level); return MDBX_PROBLEM; + } int nkeys = page_numkeys(y->mc_pg[r->level]); assert(nkeys > 0); @@ -25337,7 +25395,7 @@ __dll_export #ifdef MDBX_BUILD_TIMESTAMP MDBX_BUILD_TIMESTAMP #else - __DATE__ " " __TIME__ + "\"" __DATE__ " " __TIME__ "\"" #endif /* MDBX_BUILD_TIMESTAMP */ , @@ -25502,7 +25560,7 @@ __dll_export #else #ifdef __INTEL_COMPILER "Intel C/C++ " STRINGIFY(__INTEL_COMPILER) - #elsif defined(__apple_build_version__) + #elif defined(__apple_build_version__) "Apple clang " STRINGIFY(__apple_build_version__) #elif defined(__ibmxl__) "IBM clang C " STRINGIFY(__ibmxl_version__) "." STRINGIFY(__ibmxl_release__) @@ -26899,6 +26957,7 @@ static int mdbx_check_fs_local(mdbx_filehandle_t handle, int flags) { strncasecmp("cifs", name, name_len) == 0 || strncasecmp("ncpfs", name, name_len) == 0 || strncasecmp("smbfs", name, name_len) == 0 || + strcasecmp("9P" /* WSL2 */, name) == 0 || ((name_len > 3 && strncasecmp("fuse", name, 4) == 0) && strncasecmp("fuseblk", name, name_len) != 0)) && !(flags & MDBX_EXCLUSIVE)) @@ -27104,8 +27163,8 @@ MDBX_INTERNAL_FUNC int mdbx_munmap(mdbx_mmap_t *map) { return MDBX_SUCCESS; } -MDBX_INTERNAL_FUNC int mdbx_mresize(int flags, mdbx_mmap_t *map, size_t size, - size_t limit, const bool may_move) { +MDBX_INTERNAL_FUNC int mdbx_mresize(const int flags, mdbx_mmap_t *map, + size_t size, size_t limit) { assert(size <= limit); #if defined(_WIN32) || defined(_WIN64) assert(size != map->current || limit != map->limit || size < map->filesize); @@ -27149,6 +27208,9 @@ MDBX_INTERNAL_FUNC int mdbx_mresize(int flags, mdbx_mmap_t *map, size_t size, * - change size of mapped view; * - extend read-only mapping; * Therefore we should unmap/map entire section. */ + if ((flags & MDBX_MRESIZE_MAY_UNMAP) == 0) + return MDBX_RESULT_TRUE; + status = NtUnmapViewOfSection(GetCurrentProcess(), map->address); if (!NT_SUCCESS(status)) return ntstatus2errcode(status); @@ -27184,7 +27246,7 @@ MDBX_INTERNAL_FUNC int mdbx_mresize(int flags, mdbx_mmap_t *map, size_t size, if (status != (NTSTATUS) /* STATUS_CONFLICTING_ADDRESSES */ 0xC0000018) goto bailout_ntstatus /* no way to recovery */; - if (may_move) + if (flags & MDBX_MRESIZE_MAY_MOVE) /* the base address could be changed */ map->address = NULL; } @@ -27242,7 +27304,7 @@ retry_mapview:; if (!NT_SUCCESS(status)) { if (status == (NTSTATUS) /* STATUS_CONFLICTING_ADDRESSES */ 0xC0000018 && - map->address && may_move) { + map->address && (flags & MDBX_MRESIZE_MAY_MOVE) != 0) { /* try remap at another base address */ map->address = NULL; goto retry_mapview; @@ -27253,7 +27315,7 @@ retry_mapview:; if (map->address && (size != map->current || limit != map->limit)) { /* try remap with previously size and limit, * but will return MDBX_UNABLE_EXTEND_MAPSIZE on success */ - rc = MDBX_UNABLE_EXTEND_MAPSIZE; + rc = (limit > map->limit) ? MDBX_UNABLE_EXTEND_MAPSIZE : MDBX_RESULT_TRUE; size = map->current; ReservedSize = limit = map->limit; goto retry_file_and_section; @@ -27277,7 +27339,8 @@ retry_mapview:; if (flags & MDBX_RDONLY) { map->current = (filesize > limit) ? 
limit : (size_t)filesize; if (map->current != size) - rc = MDBX_UNABLE_EXTEND_MAPSIZE; + rc = + (size > map->current) ? MDBX_UNABLE_EXTEND_MAPSIZE : MDBX_RESULT_TRUE; } else if (filesize != size) { rc = mdbx_ftruncate(map->fd, size); if (rc != MDBX_SUCCESS) @@ -27300,7 +27363,8 @@ retry_mapview:; uint8_t *ptr = MAP_FAILED; #if defined(MREMAP_MAYMOVE) - ptr = mremap(map->address, map->limit, limit, may_move ? MREMAP_MAYMOVE : 0); + ptr = mremap(map->address, map->limit, limit, + (flags & MDBX_MRESIZE_MAY_MOVE) ? MREMAP_MAYMOVE : 0); if (ptr == MAP_FAILED) { const int err = errno; switch (err) { @@ -27349,7 +27413,7 @@ retry_mapview:; if (ptr == MAP_FAILED) { /* unmap and map again whole region */ - if (!may_move) { + if ((flags & MDBX_MRESIZE_MAY_UNMAP) == 0) { /* TODO: Perhaps here it is worth to implement suspend/resume threads * and perform unmap/map as like for Windows. */ return MDBX_UNABLE_EXTEND_MAPSIZE; @@ -27358,9 +27422,31 @@ retry_mapview:; if (unlikely(munmap(map->address, map->limit))) return errno; - ptr = mmap(map->address, limit, mmap_prot, mmap_flags, map->fd, 0); + ptr = mmap(map->address, limit, mmap_prot, + (flags & MDBX_MRESIZE_MAY_MOVE) + ? mmap_flags + : mmap_flags | (MAP_FIXED_NOREPLACE ? MAP_FIXED_NOREPLACE + : MAP_FIXED), + map->fd, 0); + if (MAP_FIXED_NOREPLACE != 0 && MAP_FIXED_NOREPLACE != MAP_FIXED && + unlikely(ptr == MAP_FAILED) && !(flags & MDBX_MRESIZE_MAY_MOVE) && + errno == /* kernel don't support MAP_FIXED_NOREPLACE */ EINVAL) + ptr = mmap(map->address, limit, mmap_prot, mmap_flags | MAP_FIXED, + map->fd, 0); + if (unlikely(ptr == MAP_FAILED)) { - ptr = mmap(map->address, map->limit, mmap_prot, mmap_flags, map->fd, 0); + /* try to restore prev mapping */ + ptr = mmap(map->address, map->limit, mmap_prot, + (flags & MDBX_MRESIZE_MAY_MOVE) + ? mmap_flags + : mmap_flags | (MAP_FIXED_NOREPLACE ? MAP_FIXED_NOREPLACE + : MAP_FIXED), + map->fd, 0); + if (MAP_FIXED_NOREPLACE != 0 && MAP_FIXED_NOREPLACE != MAP_FIXED && + unlikely(ptr == MAP_FAILED) && !(flags & MDBX_MRESIZE_MAY_MOVE) && + errno == /* kernel don't support MAP_FIXED_NOREPLACE */ EINVAL) + ptr = mmap(map->address, map->limit, mmap_prot, mmap_flags | MAP_FIXED, + map->fd, 0); if (unlikely(ptr == MAP_FAILED)) { VALGRIND_MAKE_MEM_NOACCESS(map->address, map->current); /* Unpoisoning is required for ASAN to avoid false-positive diagnostic @@ -27478,7 +27564,8 @@ mdbx_osal_16dot16_to_monotime(uint32_t seconds_16dot16) { #else const uint64_t ratio = UINT64_C(1000000000); #endif - return (ratio * seconds_16dot16 + 32768) >> 16; + const uint64_t ret = (ratio * seconds_16dot16 + 32768) >> 16; + return likely(ret || seconds_16dot16 == 0) ? ret : /* fix underflow */ 1; } MDBX_INTERNAL_FUNC uint32_t mdbx_osal_monotime_to_16dot16(uint64_t monotime) { @@ -27490,13 +27577,15 @@ MDBX_INTERNAL_FUNC uint32_t mdbx_osal_monotime_to_16dot16(uint64_t monotime) { if (monotime > limit) return UINT32_MAX; } + const uint32_t ret = #if defined(_WIN32) || defined(_WIN64) - return (uint32_t)((monotime << 16) / performance_frequency.QuadPart); + (uint32_t)((monotime << 16) / performance_frequency.QuadPart); #elif defined(__APPLE__) || defined(__MACH__) - return (uint32_t)((monotime << 16) / ratio_16dot16_to_monotine); + (uint32_t)((monotime << 16) / ratio_16dot16_to_monotine); #else - return (uint32_t)(monotime * 128 / 1953125); + (uint32_t)(monotime * 128 / 1953125); #endif + return likely(ret || monotime == 0) ? 
ret : /* fix underflow */ 1;
 }
 
 MDBX_INTERNAL_FUNC uint64_t mdbx_osal_monotime(void) {
@@ -27627,8 +27716,8 @@ static LSTATUS mdbx_RegGetValue(HKEY hKey, LPCSTR lpSubKey, LPCSTR lpValue,
 }
 #endif
 
-static __cold __maybe_unused bool bootid_parse_uuid(bin128_t *s, const void *p,
-                                                    const size_t n) {
+MDBX_MAYBE_UNUSED static __cold bool
+bootid_parse_uuid(bin128_t *s, const void *p, const size_t n) {
   if (n > 31) {
     unsigned bits = 0;
     for (unsigned i = 0; i < n; ++i) /* try parse an UUID in text form */ {
@@ -27983,7 +28072,7 @@ __cold int mdbx_get_sysraminfo(intptr_t *page_size, intptr_t *total_pages,
   if (unlikely(pagesize < MIN_PAGESIZE || !is_powerof2(pagesize)))
     return MDBX_INCOMPATIBLE;
 
-  __maybe_unused const int log2page = log2n_powerof2(pagesize);
+  MDBX_MAYBE_UNUSED const int log2page = log2n_powerof2(pagesize);
   assert(pagesize == (INT64_C(1) << log2page));
   (void)log2page;
 
@@ -28114,10 +28203,10 @@ __dll_export
 const struct MDBX_version_info mdbx_version = {
     0,
     10,
-    0,
-    0,
-    {"2021-05-09T03:01:59+03:00", "794e1a9437599eaf67ef14c38adfc811ebba47cd", "aa1f6fbd5f6d39f92c5dd771fb521ea533a2358a",
-     "v0.10.0-0-gaa1f6fbd"},
+    1,
+    15,
+    {"2021-06-18T15:13:51+03:00", "1c2ca15627c5c4e72657c00530c8a9a71ccd5128", "63e7276c7da864d47c004cc959dd8c6b1731c247",
+     "v0.10.1-15-g63e7276c"},
     sourcery};
 
 __dll_export
@@ -28394,6 +28483,7 @@ static int suspend_and_append(mdbx_handle_array_t **array,
 MDBX_INTERNAL_FUNC int
 mdbx_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array) {
+  mdbx_assert(env, (env->me_flags & MDBX_NOTLS) == 0);
   const uintptr_t CurrentTid = GetCurrentThreadId();
   int rc;
   if (env->me_lck_mmap.lck) {
@@ -28411,12 +28501,6 @@ mdbx_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array) {
       if (reader->mr_tid.weak == CurrentTid ||
           reader->mr_tid.weak == WriteTxnOwner)
         goto skip_lck;
-      if (env->me_flags & MDBX_NOTLS) {
-        /* Skip duplicates in no-tls mode */
-        for (const MDBX_reader *scan = reader; --scan >= begin;)
-          if (scan->mr_tid.weak == reader->mr_tid.weak)
-            goto skip_lck;
-      }
 
       rc = suspend_and_append(array, (mdbx_tid_t)reader->mr_tid.weak);
       if (rc != MDBX_SUCCESS) {
@@ -28982,7 +29066,11 @@ static __cold uint8_t probe_for_WSL(const char *tag) {
   const char *const wsl = strstr(tag, "wsl");
   if (wsl && wsl[3] >= '2' && wsl[3] <= '9')
     return wsl[3] - '0';
-  return (WSL || wsl || strcasestr(tag, "Microsoft")) ? 1 : 0;
+  if (WSL || wsl || strcasestr(tag, "Microsoft"))
+    /* Expecting no new kernels within WSL1; otherwise the kernel will be
+     * explicitly marked by an appropriate WSL-version hint. */
+    return (mdbx_linux_kernel_version < /* 4.19.x */ 0x04130000) ? 1 : 2;
+  return 0;
 }
 #endif /* Linux */
 
@@ -28992,16 +29080,6 @@ mdbx_global_constructor(void) {
 #if defined(__linux__) || defined(__gnu_linux__)
   struct utsname buffer;
   if (uname(&buffer) == 0) {
-    /* "Official" way of detecting WSL1 but not WSL2
-     * https://github.com/Microsoft/WSL/issues/423#issuecomment-221627364
-     *
-     * WARNING: False negative detection of WSL1 will result in DATA LOSS!
-     * So, the REQUIREMENTS for this code:
-     * 1. MUST detect WSL1 without false-negatives.
-     * 2. DESIRABLE detect WSL2 but without the risk of violating the first.
*/ - mdbx_RunningOnWSL1 = probe_for_WSL(buffer.version) == 1 || - probe_for_WSL(buffer.sysname) == 1 || - probe_for_WSL(buffer.release) == 1; int i = 0; char *p = buffer.release; while (*p && i < 4) { @@ -29017,6 +29095,16 @@ mdbx_global_constructor(void) { ++p; } } + /* "Official" way of detecting WSL1 but not WSL2 + * https://github.com/Microsoft/WSL/issues/423#issuecomment-221627364 + * + * WARNING: False negative detection of WSL1 will result in DATA LOSS! + * So, the REQUIREMENTS for this code: + * 1. MUST detect WSL1 without false-negatives. + * 2. DESIRABLE detect WSL2 but without the risk of violating the first. */ + mdbx_RunningOnWSL1 = probe_for_WSL(buffer.version) == 1 || + probe_for_WSL(buffer.sysname) == 1 || + probe_for_WSL(buffer.release) == 1; } #endif /* Linux */ @@ -29140,6 +29228,22 @@ static int lck_op(mdbx_filehandle_t fd, int cmd, int lck, off_t offset, return MDBX_SUCCESS; } rc = errno; +#if MDBX_USE_OFDLOCKS + if (rc == EINVAL && + (cmd == F_OFD_SETLK || cmd == F_OFD_SETLKW || cmd == F_OFD_GETLK)) { + /* fallback to non-OFD locks */ + if (cmd == F_OFD_SETLK) + cmd = F_SETLK; + else if (cmd == F_OFD_SETLKW) + cmd = F_SETLKW; + else + cmd = F_GETLK; + op_setlk = F_SETLK; + op_setlkw = F_SETLKW; + op_getlk = F_GETLK; + continue; + } +#endif /* MDBX_USE_OFDLOCKS */ if (rc != EINTR || cmd == op_setlkw) { mdbx_assert(nullptr, MDBX_IS_ERROR(rc)); return rc; diff --git a/mdbx/dist/mdbx.c++ b/mdbx/dist/mdbx.c++ index f9ab9cb..db406bd 100644 --- a/mdbx/dist/mdbx.c++ +++ b/mdbx/dist/mdbx.c++ @@ -12,7 +12,7 @@ * . */ #define xMDBX_ALLOY 1 -#define MDBX_BUILD_SOURCERY 10aa116f5f6a1fca4ccea1310d3d331a39161abc5b63b6a30e01812eab671e7c_v0_10_0_0_gaa1f6fbd +#define MDBX_BUILD_SOURCERY 220eee3b3a4b48cb20897d772c6665de65226e5811d687b6f7f43b2beeb9cb31_v0_10_1_15_g63e7276c #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -206,14 +206,6 @@ # endif #endif /* __must_check_result */ -#ifndef __maybe_unused -# if defined(__GNUC__) || __has_attribute(__unused__) -# define __maybe_unused __attribute__((__unused__)) -# else -# define __maybe_unused -# endif -#endif /* __maybe_unused */ - #if !defined(__noop) && !defined(_MSC_VER) # define __noop(...) do {} while(0) #endif /* __noop */ @@ -359,7 +351,7 @@ #endif /* __anonymous_struct_extension__ */ #ifndef __Wpedantic_format_voidptr - static __inline __maybe_unused const void* MDBX_PURE_FUNCTION + MDBX_MAYBE_UNUSED MDBX_PURE_FUNCTION static __inline const void* __Wpedantic_format_voidptr(const void* ptr) {return ptr;} # define __Wpedantic_format_voidptr(ARG) __Wpedantic_format_voidptr(ARG) #endif /* __Wpedantic_format_voidptr */ @@ -971,7 +963,7 @@ typedef pthread_mutex_t mdbx_fastmutex_t; /* Get the size of a memory page for the system. * This is the basic size that the platform's memory manager uses, and is * fundamental to the use of memory-mapped files. 
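 * (On Windows the value comes from GetSystemInfo(); on POSIX targets it
 * is presumably obtained via sysconf(), per the branches below.)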
*/ -MDBX_NOTHROW_CONST_FUNCTION static __maybe_unused __inline size_t +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline size_t mdbx_syspagesize(void) { #if defined(_WIN32) || defined(_WIN64) SYSTEM_INFO si; @@ -1058,7 +1050,7 @@ extern void mdbx_osal_jitter(bool tiny); #include #endif -static __maybe_unused __inline void mdbx_compiler_barrier(void) { +MDBX_MAYBE_UNUSED static __inline void mdbx_compiler_barrier(void) { #if defined(__clang__) || defined(__GNUC__) __asm__ __volatile__("" ::: "memory"); #elif defined(_MSC_VER) @@ -1078,7 +1070,7 @@ static __maybe_unused __inline void mdbx_compiler_barrier(void) { #endif } -static __maybe_unused __inline void mdbx_memory_barrier(void) { +MDBX_MAYBE_UNUSED static __inline void mdbx_memory_barrier(void) { #ifdef MDBX_HAVE_C11ATOMICS atomic_thread_fence(memory_order_seq_cst); #elif defined(__ATOMIC_SEQ_CST) @@ -1118,8 +1110,8 @@ static __maybe_unused __inline void mdbx_memory_barrier(void) { #define mdbx_asprintf asprintf #define mdbx_vasprintf vasprintf #else -MDBX_INTERNAL_FUNC MDBX_PRINTF_ARGS(2, 3) int __maybe_unused - mdbx_asprintf(char **strp, const char *fmt, ...); +MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC + MDBX_PRINTF_ARGS(2, 3) int mdbx_asprintf(char **strp, const char *fmt, ...); MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, va_list ap); #endif @@ -1142,7 +1134,7 @@ MDBX_INTERNAL_VAR bool mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */; LIBMDBX_API char *mdbx_strdup(const char *str); #endif -static __maybe_unused __inline int mdbx_get_errno(void) { +MDBX_MAYBE_UNUSED static __inline int mdbx_get_errno(void) { #if defined(_WIN32) || defined(_WIN64) DWORD rc = GetLastError(); #else @@ -1226,8 +1218,10 @@ MDBX_INTERNAL_FUNC int mdbx_mmap(const int flags, mdbx_mmap_t *map, const size_t must, const size_t limit, const unsigned options); MDBX_INTERNAL_FUNC int mdbx_munmap(mdbx_mmap_t *map); -MDBX_INTERNAL_FUNC int mdbx_mresize(int flags, mdbx_mmap_t *map, size_t current, - size_t wanna, const bool may_move); +#define MDBX_MRESIZE_MAY_MOVE 0x00000100 +#define MDBX_MRESIZE_MAY_UNMAP 0x00000200 +MDBX_INTERNAL_FUNC int mdbx_mresize(const int flags, mdbx_mmap_t *map, + size_t size, size_t limit); #if defined(_WIN32) || defined(_WIN64) typedef struct { unsigned limit, count; @@ -1244,7 +1238,7 @@ MDBX_INTERNAL_FUNC int mdbx_msync(mdbx_mmap_t *map, size_t offset, MDBX_INTERNAL_FUNC int mdbx_check_fs_rdonly(mdbx_filehandle_t handle, const char *pathname, int err); -static __maybe_unused __inline uint32_t mdbx_getpid(void) { +MDBX_MAYBE_UNUSED static __inline uint32_t mdbx_getpid(void) { STATIC_ASSERT(sizeof(mdbx_pid_t) <= sizeof(uint32_t)); #if defined(_WIN32) || defined(_WIN64) return GetCurrentProcessId(); @@ -1253,7 +1247,7 @@ static __maybe_unused __inline uint32_t mdbx_getpid(void) { #endif } -static __maybe_unused __inline uintptr_t mdbx_thread_self(void) { +MDBX_MAYBE_UNUSED static __inline uintptr_t mdbx_thread_self(void) { mdbx_tid_t thunk; STATIC_ASSERT(sizeof(uintptr_t) >= sizeof(thunk)); #if defined(_WIN32) || defined(_WIN64) @@ -1264,7 +1258,7 @@ static __maybe_unused __inline uintptr_t mdbx_thread_self(void) { return (uintptr_t)thunk; } -MDBX_INTERNAL_FUNC void __maybe_unused mdbx_osal_jitter(bool tiny); +MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC void mdbx_osal_jitter(bool tiny); MDBX_INTERNAL_FUNC uint64_t mdbx_osal_monotime(void); MDBX_INTERNAL_FUNC uint64_t mdbx_osal_16dot16_to_monotime(uint32_t seconds_16dot16); @@ -2035,7 +2029,7 @@ static __always_inline memory_order mo_c11_load(enum 
MDBX_memory_order fence) { static __inline void mdbx_jitter4testing(bool tiny); -static __maybe_unused __always_inline void +MDBX_MAYBE_UNUSED static __always_inline void mdbx_memory_fence(enum MDBX_memory_order order, bool write) { #ifdef MDBX_HAVE_C11ATOMICS atomic_thread_fence(write ? mo_c11_store(order) : mo_c11_load(order)); @@ -2047,7 +2041,7 @@ mdbx_memory_fence(enum MDBX_memory_order order, bool write) { #endif /* MDBX_HAVE_C11ATOMICS */ } -static __maybe_unused __always_inline uint32_t +MDBX_MAYBE_UNUSED static __always_inline uint32_t atomic_store32(MDBX_atomic_uint32_t *p, const uint32_t value, enum MDBX_memory_order order) { STATIC_ASSERT(sizeof(MDBX_atomic_uint32_t) == 4); @@ -2063,7 +2057,7 @@ atomic_store32(MDBX_atomic_uint32_t *p, const uint32_t value, return value; } -static __maybe_unused __always_inline uint32_t +MDBX_MAYBE_UNUSED static __always_inline uint32_t atomic_load32(const MDBX_atomic_uint32_t *p, enum MDBX_memory_order order) { STATIC_ASSERT(sizeof(MDBX_atomic_uint32_t) == 4); #ifdef MDBX_HAVE_C11ATOMICS @@ -2078,7 +2072,7 @@ atomic_load32(const MDBX_atomic_uint32_t *p, enum MDBX_memory_order order) { #endif /* MDBX_HAVE_C11ATOMICS */ } -static __maybe_unused __always_inline uint64_t +MDBX_MAYBE_UNUSED static __always_inline uint64_t atomic_store64(MDBX_atomic_uint64_t *p, const uint64_t value, enum MDBX_memory_order order) { STATIC_ASSERT(sizeof(MDBX_atomic_uint64_t) == 8); @@ -2102,7 +2096,7 @@ atomic_store64(MDBX_atomic_uint64_t *p, const uint64_t value, return value; } -static __maybe_unused +MDBX_MAYBE_UNUSED static #if MDBX_64BIT_ATOMIC __always_inline #endif /* MDBX_64BIT_ATOMIC */ @@ -2690,15 +2684,7 @@ typedef struct MDBX_dbx { * Every operation requires a transaction handle. */ struct MDBX_txn { #define MDBX_MT_SIGNATURE UINT32_C(0x93D53A31) - size_t mt_signature; - MDBX_txn *mt_parent; /* parent of a nested txn */ - /* Nested txn under this txn, set together with flag MDBX_TXN_HAS_CHILD */ - MDBX_txn *mt_child; - MDBX_geo mt_geo; - /* next unallocated page */ -#define mt_next_pgno mt_geo.next - /* corresponding to the current size of datafile */ -#define mt_end_pgno mt_geo.now + uint32_t mt_signature; /* Transaction Flags */ /* mdbx_txn_begin() flags */ @@ -2727,8 +2713,17 @@ struct MDBX_txn { MDBX_SHRINK_ALLOWED) #error "Oops, some flags overlapped or wrong" #endif + uint32_t mt_flags; + + MDBX_txn *mt_parent; /* parent of a nested txn */ + /* Nested txn under this txn, set together with flag MDBX_TXN_HAS_CHILD */ + MDBX_txn *mt_child; + MDBX_geo mt_geo; + /* next unallocated page */ +#define mt_next_pgno mt_geo.next + /* corresponding to the current size of datafile */ +#define mt_end_pgno mt_geo.now - unsigned mt_flags; /* The ID of this transaction. IDs are integers incrementing from 1. * Only committed write transactions increment the ID. If a transaction * aborts, the ID may be re-used by the next writer. 
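 * (Read-only transactions simply adopt the ID of the snapshot they
 * observe, so several concurrent readers may share the same ID.)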
*/ @@ -3148,7 +3143,7 @@ void mdbx_assert_fail(const MDBX_env *env, const char *msg, const char *func, #define mdbx_flush_incoherent_cpu_writeback() mdbx_compiler_barrier() #endif /* MDBX_CPU_WRITEBACK_INCOHERENT */ -static __maybe_unused __inline void +MDBX_MAYBE_UNUSED static __inline void mdbx_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { #if MDBX_MMAP_INCOHERENT_FILE_WRITE char *const begin = (char *)(-pagesize & (intptr_t)addr); @@ -3190,7 +3185,7 @@ MDBX_INTERNAL_FUNC void mdbx_rthc_global_init(void); MDBX_INTERNAL_FUNC void mdbx_rthc_global_dtor(void); MDBX_INTERNAL_FUNC void mdbx_rthc_thread_dtor(void *ptr); -static __maybe_unused __inline void mdbx_jitter4testing(bool tiny) { +MDBX_MAYBE_UNUSED static __inline void mdbx_jitter4testing(bool tiny) { #if MDBX_DEBUG if (MDBX_DBG_JITTER & mdbx_runtime_flags) mdbx_osal_jitter(tiny); @@ -3343,35 +3338,35 @@ typedef struct MDBX_node { /* Do not spill pages to disk if txn is getting full, may fail instead */ #define MDBX_NOSPILL 0x8000 -MDBX_NOTHROW_CONST_FUNCTION static __maybe_unused __inline pgno_t +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t pgno_add(pgno_t base, pgno_t augend) { assert(base <= MAX_PAGENO); return (augend < MAX_PAGENO - base) ? base + augend : MAX_PAGENO; } -MDBX_NOTHROW_CONST_FUNCTION static __maybe_unused __inline pgno_t +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t pgno_sub(pgno_t base, pgno_t subtrahend) { assert(base >= MIN_PAGENO); return (subtrahend < base - MIN_PAGENO) ? base - subtrahend : MIN_PAGENO; } -MDBX_NOTHROW_CONST_FUNCTION static __always_inline __maybe_unused bool +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline bool is_powerof2(size_t x) { return (x & (x - 1)) == 0; } -MDBX_NOTHROW_CONST_FUNCTION static __always_inline __maybe_unused size_t +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline size_t floor_powerof2(size_t value, size_t granularity) { assert(is_powerof2(granularity)); return value & ~(granularity - 1); } -MDBX_NOTHROW_CONST_FUNCTION static __always_inline __maybe_unused size_t +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline size_t ceil_powerof2(size_t value, size_t granularity) { return floor_powerof2(value + granularity - 1, granularity); } -MDBX_NOTHROW_CONST_FUNCTION static __maybe_unused unsigned +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static unsigned log2n_powerof2(size_t value) { assert(value > 0 && value < INT32_MAX && is_powerof2(value)); assert((value & -(int32_t)value) == value); @@ -3401,7 +3396,7 @@ log2n_powerof2(size_t value) { #define ENV_USABLE_FLAGS (ENV_CHANGEABLE_FLAGS | ENV_CHANGELESS_FLAGS) #if !defined(__cplusplus) || CONSTEXPR_ENUM_FLAGS_OPERATIONS -static __maybe_unused void static_checks(void) { +MDBX_MAYBE_UNUSED static void static_checks(void) { STATIC_ASSERT_MSG(INT16_MAX - CORE_DBS == MDBX_MAX_DBI, "Oops, MDBX_MAX_DBI or CORE_DBS?"); STATIC_ASSERT_MSG((unsigned)(MDBX_DB_ACCEDE | MDBX_CREATE) == @@ -3619,7 +3614,8 @@ template struct path_to_pchar { operator const char *() const { return str.c_str(); } }; -template PATH pchar_to_path(const char *c_str) { +template +MDBX_MAYBE_UNUSED PATH pchar_to_path(const char *c_str) { return PATH(c_str); } @@ -3656,7 +3652,8 @@ template <> struct path_to_pchar { operator const char *() const { return str.c_str(); } }; -template <> std::wstring pchar_to_path(const char *c_str) { +template <> +MDBX_MAYBE_UNUSED std::wstring pchar_to_path(const char *c_str) { std::wstring wstr; if 
(c_str && *c_str) { const int chars = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, c_str, diff --git a/mdbx/dist/mdbx.h b/mdbx/dist/mdbx.h index f902b7b..7b719f4 100644 --- a/mdbx/dist/mdbx.h +++ b/mdbx/dist/mdbx.h @@ -454,6 +454,15 @@ typedef mode_t mdbx_mode_t; #endif #endif /* MDBX_PRINTF_ARGS */ +#if defined(DOXYGEN) || (__has_cpp_attribute(maybe_unused) && \ + (defined(__cplusplus) || __STDC_VERSION__ > 202005L)) +#define MDBX_MAYBE_UNUSED [[maybe_unused]] +#elif defined(__GNUC__) || __has_attribute(__unused__) +#define MDBX_MAYBE_UNUSED __attribute__((__unused__)) +#else +#define MDBX_MAYBE_UNUSED +#endif /* MDBX_MAYBE_UNUSED */ + /* Oh, below are some songs and dances since: * - C++ requires explicit definition of the necessary operators. * - the proper implementation of DEFINE_ENUM_FLAG_OPERATORS for C++ required @@ -2275,7 +2284,8 @@ struct MDBX_envinfo { /** Statistics of page operations. * \details Overall statistics of page operations of all (running, completed * and aborted) transactions in the current multi-process session (since the - * first process opened the database). */ + * first process opened the database after everyone had previously closed it). + */ struct { uint64_t newly; /**< Quantity of a new pages added */ uint64_t cow; /**< Quantity of pages copied for update */ diff --git a/mdbx/dist/mdbx_chk.c b/mdbx/dist/mdbx_chk.c index 21a3898..f9b17d7 100644 --- a/mdbx/dist/mdbx_chk.c +++ b/mdbx/dist/mdbx_chk.c @@ -34,7 +34,7 @@ * top-level directory of the distribution or, alternatively, at * . */ -#define MDBX_BUILD_SOURCERY 10aa116f5f6a1fca4ccea1310d3d331a39161abc5b63b6a30e01812eab671e7c_v0_10_0_0_gaa1f6fbd +#define MDBX_BUILD_SOURCERY 220eee3b3a4b48cb20897d772c6665de65226e5811d687b6f7f43b2beeb9cb31_v0_10_1_15_g63e7276c #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -228,14 +228,6 @@ # endif #endif /* __must_check_result */ -#ifndef __maybe_unused -# if defined(__GNUC__) || __has_attribute(__unused__) -# define __maybe_unused __attribute__((__unused__)) -# else -# define __maybe_unused -# endif -#endif /* __maybe_unused */ - #if !defined(__noop) && !defined(_MSC_VER) # define __noop(...) do {} while(0) #endif /* __noop */ @@ -381,7 +373,7 @@ #endif /* __anonymous_struct_extension__ */ #ifndef __Wpedantic_format_voidptr - static __inline __maybe_unused const void* MDBX_PURE_FUNCTION + MDBX_MAYBE_UNUSED MDBX_PURE_FUNCTION static __inline const void* __Wpedantic_format_voidptr(const void* ptr) {return ptr;} # define __Wpedantic_format_voidptr(ARG) __Wpedantic_format_voidptr(ARG) #endif /* __Wpedantic_format_voidptr */ @@ -993,7 +985,7 @@ typedef pthread_mutex_t mdbx_fastmutex_t; /* Get the size of a memory page for the system. * This is the basic size that the platform's memory manager uses, and is * fundamental to the use of memory-mapped files. 
*/ -MDBX_NOTHROW_CONST_FUNCTION static __maybe_unused __inline size_t +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline size_t mdbx_syspagesize(void) { #if defined(_WIN32) || defined(_WIN64) SYSTEM_INFO si; @@ -1080,7 +1072,7 @@ extern void mdbx_osal_jitter(bool tiny); #include #endif -static __maybe_unused __inline void mdbx_compiler_barrier(void) { +MDBX_MAYBE_UNUSED static __inline void mdbx_compiler_barrier(void) { #if defined(__clang__) || defined(__GNUC__) __asm__ __volatile__("" ::: "memory"); #elif defined(_MSC_VER) @@ -1100,7 +1092,7 @@ static __maybe_unused __inline void mdbx_compiler_barrier(void) { #endif } -static __maybe_unused __inline void mdbx_memory_barrier(void) { +MDBX_MAYBE_UNUSED static __inline void mdbx_memory_barrier(void) { #ifdef MDBX_HAVE_C11ATOMICS atomic_thread_fence(memory_order_seq_cst); #elif defined(__ATOMIC_SEQ_CST) @@ -1140,8 +1132,8 @@ static __maybe_unused __inline void mdbx_memory_barrier(void) { #define mdbx_asprintf asprintf #define mdbx_vasprintf vasprintf #else -MDBX_INTERNAL_FUNC MDBX_PRINTF_ARGS(2, 3) int __maybe_unused - mdbx_asprintf(char **strp, const char *fmt, ...); +MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC + MDBX_PRINTF_ARGS(2, 3) int mdbx_asprintf(char **strp, const char *fmt, ...); MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, va_list ap); #endif @@ -1164,7 +1156,7 @@ MDBX_INTERNAL_VAR bool mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */; LIBMDBX_API char *mdbx_strdup(const char *str); #endif -static __maybe_unused __inline int mdbx_get_errno(void) { +MDBX_MAYBE_UNUSED static __inline int mdbx_get_errno(void) { #if defined(_WIN32) || defined(_WIN64) DWORD rc = GetLastError(); #else @@ -1248,8 +1240,10 @@ MDBX_INTERNAL_FUNC int mdbx_mmap(const int flags, mdbx_mmap_t *map, const size_t must, const size_t limit, const unsigned options); MDBX_INTERNAL_FUNC int mdbx_munmap(mdbx_mmap_t *map); -MDBX_INTERNAL_FUNC int mdbx_mresize(int flags, mdbx_mmap_t *map, size_t current, - size_t wanna, const bool may_move); +#define MDBX_MRESIZE_MAY_MOVE 0x00000100 +#define MDBX_MRESIZE_MAY_UNMAP 0x00000200 +MDBX_INTERNAL_FUNC int mdbx_mresize(const int flags, mdbx_mmap_t *map, + size_t size, size_t limit); #if defined(_WIN32) || defined(_WIN64) typedef struct { unsigned limit, count; @@ -1266,7 +1260,7 @@ MDBX_INTERNAL_FUNC int mdbx_msync(mdbx_mmap_t *map, size_t offset, MDBX_INTERNAL_FUNC int mdbx_check_fs_rdonly(mdbx_filehandle_t handle, const char *pathname, int err); -static __maybe_unused __inline uint32_t mdbx_getpid(void) { +MDBX_MAYBE_UNUSED static __inline uint32_t mdbx_getpid(void) { STATIC_ASSERT(sizeof(mdbx_pid_t) <= sizeof(uint32_t)); #if defined(_WIN32) || defined(_WIN64) return GetCurrentProcessId(); @@ -1275,7 +1269,7 @@ static __maybe_unused __inline uint32_t mdbx_getpid(void) { #endif } -static __maybe_unused __inline uintptr_t mdbx_thread_self(void) { +MDBX_MAYBE_UNUSED static __inline uintptr_t mdbx_thread_self(void) { mdbx_tid_t thunk; STATIC_ASSERT(sizeof(uintptr_t) >= sizeof(thunk)); #if defined(_WIN32) || defined(_WIN64) @@ -1286,7 +1280,7 @@ static __maybe_unused __inline uintptr_t mdbx_thread_self(void) { return (uintptr_t)thunk; } -MDBX_INTERNAL_FUNC void __maybe_unused mdbx_osal_jitter(bool tiny); +MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC void mdbx_osal_jitter(bool tiny); MDBX_INTERNAL_FUNC uint64_t mdbx_osal_monotime(void); MDBX_INTERNAL_FUNC uint64_t mdbx_osal_16dot16_to_monotime(uint32_t seconds_16dot16); @@ -2057,7 +2051,7 @@ static __always_inline memory_order mo_c11_load(enum 
MDBX_memory_order fence) { static __inline void mdbx_jitter4testing(bool tiny); -static __maybe_unused __always_inline void +MDBX_MAYBE_UNUSED static __always_inline void mdbx_memory_fence(enum MDBX_memory_order order, bool write) { #ifdef MDBX_HAVE_C11ATOMICS atomic_thread_fence(write ? mo_c11_store(order) : mo_c11_load(order)); @@ -2069,7 +2063,7 @@ mdbx_memory_fence(enum MDBX_memory_order order, bool write) { #endif /* MDBX_HAVE_C11ATOMICS */ } -static __maybe_unused __always_inline uint32_t +MDBX_MAYBE_UNUSED static __always_inline uint32_t atomic_store32(MDBX_atomic_uint32_t *p, const uint32_t value, enum MDBX_memory_order order) { STATIC_ASSERT(sizeof(MDBX_atomic_uint32_t) == 4); @@ -2085,7 +2079,7 @@ atomic_store32(MDBX_atomic_uint32_t *p, const uint32_t value, return value; } -static __maybe_unused __always_inline uint32_t +MDBX_MAYBE_UNUSED static __always_inline uint32_t atomic_load32(const MDBX_atomic_uint32_t *p, enum MDBX_memory_order order) { STATIC_ASSERT(sizeof(MDBX_atomic_uint32_t) == 4); #ifdef MDBX_HAVE_C11ATOMICS @@ -2100,7 +2094,7 @@ atomic_load32(const MDBX_atomic_uint32_t *p, enum MDBX_memory_order order) { #endif /* MDBX_HAVE_C11ATOMICS */ } -static __maybe_unused __always_inline uint64_t +MDBX_MAYBE_UNUSED static __always_inline uint64_t atomic_store64(MDBX_atomic_uint64_t *p, const uint64_t value, enum MDBX_memory_order order) { STATIC_ASSERT(sizeof(MDBX_atomic_uint64_t) == 8); @@ -2124,7 +2118,7 @@ atomic_store64(MDBX_atomic_uint64_t *p, const uint64_t value, return value; } -static __maybe_unused +MDBX_MAYBE_UNUSED static #if MDBX_64BIT_ATOMIC __always_inline #endif /* MDBX_64BIT_ATOMIC */ @@ -2712,15 +2706,7 @@ typedef struct MDBX_dbx { * Every operation requires a transaction handle. */ struct MDBX_txn { #define MDBX_MT_SIGNATURE UINT32_C(0x93D53A31) - size_t mt_signature; - MDBX_txn *mt_parent; /* parent of a nested txn */ - /* Nested txn under this txn, set together with flag MDBX_TXN_HAS_CHILD */ - MDBX_txn *mt_child; - MDBX_geo mt_geo; - /* next unallocated page */ -#define mt_next_pgno mt_geo.next - /* corresponding to the current size of datafile */ -#define mt_end_pgno mt_geo.now + uint32_t mt_signature; /* Transaction Flags */ /* mdbx_txn_begin() flags */ @@ -2749,8 +2735,17 @@ struct MDBX_txn { MDBX_SHRINK_ALLOWED) #error "Oops, some flags overlapped or wrong" #endif + uint32_t mt_flags; + + MDBX_txn *mt_parent; /* parent of a nested txn */ + /* Nested txn under this txn, set together with flag MDBX_TXN_HAS_CHILD */ + MDBX_txn *mt_child; + MDBX_geo mt_geo; + /* next unallocated page */ +#define mt_next_pgno mt_geo.next + /* corresponding to the current size of datafile */ +#define mt_end_pgno mt_geo.now - unsigned mt_flags; /* The ID of this transaction. IDs are integers incrementing from 1. * Only committed write transactions increment the ID. If a transaction * aborts, the ID may be re-used by the next writer. 
*/ @@ -3170,7 +3165,7 @@ void mdbx_assert_fail(const MDBX_env *env, const char *msg, const char *func, #define mdbx_flush_incoherent_cpu_writeback() mdbx_compiler_barrier() #endif /* MDBX_CPU_WRITEBACK_INCOHERENT */ -static __maybe_unused __inline void +MDBX_MAYBE_UNUSED static __inline void mdbx_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { #if MDBX_MMAP_INCOHERENT_FILE_WRITE char *const begin = (char *)(-pagesize & (intptr_t)addr); @@ -3212,7 +3207,7 @@ MDBX_INTERNAL_FUNC void mdbx_rthc_global_init(void); MDBX_INTERNAL_FUNC void mdbx_rthc_global_dtor(void); MDBX_INTERNAL_FUNC void mdbx_rthc_thread_dtor(void *ptr); -static __maybe_unused __inline void mdbx_jitter4testing(bool tiny) { +MDBX_MAYBE_UNUSED static __inline void mdbx_jitter4testing(bool tiny) { #if MDBX_DEBUG if (MDBX_DBG_JITTER & mdbx_runtime_flags) mdbx_osal_jitter(tiny); @@ -3365,35 +3360,35 @@ typedef struct MDBX_node { /* Do not spill pages to disk if txn is getting full, may fail instead */ #define MDBX_NOSPILL 0x8000 -MDBX_NOTHROW_CONST_FUNCTION static __maybe_unused __inline pgno_t +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t pgno_add(pgno_t base, pgno_t augend) { assert(base <= MAX_PAGENO); return (augend < MAX_PAGENO - base) ? base + augend : MAX_PAGENO; } -MDBX_NOTHROW_CONST_FUNCTION static __maybe_unused __inline pgno_t +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t pgno_sub(pgno_t base, pgno_t subtrahend) { assert(base >= MIN_PAGENO); return (subtrahend < base - MIN_PAGENO) ? base - subtrahend : MIN_PAGENO; } -MDBX_NOTHROW_CONST_FUNCTION static __always_inline __maybe_unused bool +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline bool is_powerof2(size_t x) { return (x & (x - 1)) == 0; } -MDBX_NOTHROW_CONST_FUNCTION static __always_inline __maybe_unused size_t +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline size_t floor_powerof2(size_t value, size_t granularity) { assert(is_powerof2(granularity)); return value & ~(granularity - 1); } -MDBX_NOTHROW_CONST_FUNCTION static __always_inline __maybe_unused size_t +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline size_t ceil_powerof2(size_t value, size_t granularity) { return floor_powerof2(value + granularity - 1, granularity); } -MDBX_NOTHROW_CONST_FUNCTION static __maybe_unused unsigned +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static unsigned log2n_powerof2(size_t value) { assert(value > 0 && value < INT32_MAX && is_powerof2(value)); assert((value & -(int32_t)value) == value); @@ -3423,7 +3418,7 @@ log2n_powerof2(size_t value) { #define ENV_USABLE_FLAGS (ENV_CHANGEABLE_FLAGS | ENV_CHANGELESS_FLAGS) #if !defined(__cplusplus) || CONSTEXPR_ENUM_FLAGS_OPERATIONS -static __maybe_unused void static_checks(void) { +MDBX_MAYBE_UNUSED static void static_checks(void) { STATIC_ASSERT_MSG(INT16_MAX - CORE_DBS == MDBX_MAX_DBI, "Oops, MDBX_MAX_DBI or CORE_DBS?"); STATIC_ASSERT_MSG((unsigned)(MDBX_DB_ACCEDE | MDBX_CREATE) == @@ -4616,7 +4611,6 @@ int main(int argc, char *argv[]) { quiet = true; break; case 'n': - envflags |= MDBX_NOSUBDIR; break; case 'w': envflags &= ~MDBX_RDONLY; @@ -4879,11 +4873,6 @@ int main(int argc, char *argv[]) { "of may by large than the database itself,\n " "until it will be closed or reopened in read-write mode.\n"); #endif - print(" - transactions: recent %" PRIu64 ", latter reader %" PRIu64 - ", lag %" PRIi64 "\n", - envinfo.mi_recent_txnid, envinfo.mi_latter_reader_txnid, - envinfo.mi_recent_txnid - 
envinfo.mi_latter_reader_txnid); - verbose_meta(0, envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign, envinfo.mi_bootid.meta0.x, envinfo.mi_bootid.meta0.y); verbose_meta(1, envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign, @@ -4892,52 +4881,70 @@ int main(int argc, char *argv[]) { envinfo.mi_bootid.meta2.x, envinfo.mi_bootid.meta2.y); } - if (verbose > 1) - print(" - performs check for meta-pages clashes\n"); - if (meta_eq(envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign, - envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign)) { - print(" ! meta-%d and meta-%d are clashed\n", 0, 1); - ++problems_meta; - } - if (meta_eq(envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign, - envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign)) { - print(" ! meta-%d and meta-%d are clashed\n", 1, 2); - ++problems_meta; - } - if (meta_eq(envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign, - envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign)) { - print(" ! meta-%d and meta-%d are clashed\n", 2, 0); - ++problems_meta; - } - - const unsigned steady_meta_id = meta_recent(true); - const uint64_t steady_meta_txnid = get_meta_txnid(steady_meta_id); - const unsigned weak_meta_id = meta_recent(false); - const uint64_t weak_meta_txnid = get_meta_txnid(weak_meta_id); - if (envflags & MDBX_EXCLUSIVE) { + if (stuck_meta >= 0) { + if (verbose) { + print(" - skip checking meta-pages since the %u" + " is selected for verification\n", + stuck_meta); + print(" - transactions: recent %" PRIu64 + ", selected for verification %" PRIu64 ", lag %" PRIi64 "\n", + envinfo.mi_recent_txnid, get_meta_txnid(stuck_meta), + envinfo.mi_recent_txnid - get_meta_txnid(stuck_meta)); + } + } else { if (verbose > 1) - print(" - performs full check recent-txn-id with meta-pages\n"); - if (steady_meta_txnid != envinfo.mi_recent_txnid) { - print(" ! steady meta-%d txn-id mismatch recent-txn-id (%" PRIi64 - " != %" PRIi64 ")\n", - steady_meta_id, steady_meta_txnid, envinfo.mi_recent_txnid); + print(" - performs check for meta-pages clashes\n"); + if (meta_eq(envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign, + envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign)) { + print(" ! meta-%d and meta-%d are clashed\n", 0, 1); ++problems_meta; } - } else if (write_locked) { - if (verbose > 1) - print(" - performs lite check recent-txn-id with meta-pages (not a " - "monopolistic mode)\n"); - if (weak_meta_txnid != envinfo.mi_recent_txnid) { - print(" ! weak meta-%d txn-id mismatch recent-txn-id (%" PRIi64 - " != %" PRIi64 ")\n", - weak_meta_id, weak_meta_txnid, envinfo.mi_recent_txnid); + if (meta_eq(envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign, + envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign)) { + print(" ! meta-%d and meta-%d are clashed\n", 1, 2); ++problems_meta; } - } else if (verbose) { - print(" - skip check recent-txn-id with meta-pages (monopolistic or " - "read-write mode only)\n"); + if (meta_eq(envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign, + envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign)) { + print(" ! meta-%d and meta-%d are clashed\n", 2, 0); + ++problems_meta; + } + + const unsigned steady_meta_id = meta_recent(true); + const uint64_t steady_meta_txnid = get_meta_txnid(steady_meta_id); + const unsigned weak_meta_id = meta_recent(false); + const uint64_t weak_meta_txnid = get_meta_txnid(weak_meta_id); + if (envflags & MDBX_EXCLUSIVE) { + if (verbose > 1) + print(" - performs full check recent-txn-id with meta-pages\n"); + if (steady_meta_txnid != envinfo.mi_recent_txnid) { + print(" ! 
steady meta-%d txn-id mismatch recent-txn-id (%" PRIi64 + " != %" PRIi64 ")\n", + steady_meta_id, steady_meta_txnid, envinfo.mi_recent_txnid); + ++problems_meta; + } + } else if (write_locked) { + if (verbose > 1) + print(" - performs lite check recent-txn-id with meta-pages (not a " + "monopolistic mode)\n"); + if (weak_meta_txnid != envinfo.mi_recent_txnid) { + print(" ! weak meta-%d txn-id mismatch recent-txn-id (%" PRIi64 + " != %" PRIi64 ")\n", + weak_meta_id, weak_meta_txnid, envinfo.mi_recent_txnid); + ++problems_meta; + } + } else if (verbose) { + print(" - skip check recent-txn-id with meta-pages (monopolistic or " + "read-write mode only)\n"); + } + total_problems += problems_meta; + + if (verbose) + print(" - transactions: recent %" PRIu64 ", latter reader %" PRIu64 + ", lag %" PRIi64 "\n", + envinfo.mi_recent_txnid, envinfo.mi_latter_reader_txnid, + envinfo.mi_recent_txnid - envinfo.mi_latter_reader_txnid); } - total_problems += problems_meta; if (!dont_traversal) { struct problem *saved_list; @@ -5106,7 +5113,7 @@ int main(int argc, char *argv[]) { if (rc == 0 && total_problems == 1 && problems_meta == 1 && !dont_traversal && (envflags & MDBX_RDONLY) == 0 && !only_subdb && stuck_meta < 0 && - steady_meta_txnid < envinfo.mi_recent_txnid) { + get_meta_txnid(meta_recent(true)) < envinfo.mi_recent_txnid) { print("Perform sync-to-disk for make steady checkpoint at txn-id #%" PRIi64 "\n", envinfo.mi_recent_txnid); diff --git a/mdbx/dist/mdbx_copy.c b/mdbx/dist/mdbx_copy.c index 14faac5..5ef8ff1 100644 --- a/mdbx/dist/mdbx_copy.c +++ b/mdbx/dist/mdbx_copy.c @@ -34,7 +34,7 @@ * top-level directory of the distribution or, alternatively, at * . */ -#define MDBX_BUILD_SOURCERY 10aa116f5f6a1fca4ccea1310d3d331a39161abc5b63b6a30e01812eab671e7c_v0_10_0_0_gaa1f6fbd +#define MDBX_BUILD_SOURCERY 220eee3b3a4b48cb20897d772c6665de65226e5811d687b6f7f43b2beeb9cb31_v0_10_1_15_g63e7276c #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -228,14 +228,6 @@ # endif #endif /* __must_check_result */ -#ifndef __maybe_unused -# if defined(__GNUC__) || __has_attribute(__unused__) -# define __maybe_unused __attribute__((__unused__)) -# else -# define __maybe_unused -# endif -#endif /* __maybe_unused */ - #if !defined(__noop) && !defined(_MSC_VER) # define __noop(...) do {} while(0) #endif /* __noop */ @@ -381,7 +373,7 @@ #endif /* __anonymous_struct_extension__ */ #ifndef __Wpedantic_format_voidptr - static __inline __maybe_unused const void* MDBX_PURE_FUNCTION + MDBX_MAYBE_UNUSED MDBX_PURE_FUNCTION static __inline const void* __Wpedantic_format_voidptr(const void* ptr) {return ptr;} # define __Wpedantic_format_voidptr(ARG) __Wpedantic_format_voidptr(ARG) #endif /* __Wpedantic_format_voidptr */ @@ -993,7 +985,7 @@ typedef pthread_mutex_t mdbx_fastmutex_t; /* Get the size of a memory page for the system. * This is the basic size that the platform's memory manager uses, and is * fundamental to the use of memory-mapped files. 
*/ -MDBX_NOTHROW_CONST_FUNCTION static __maybe_unused __inline size_t +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline size_t mdbx_syspagesize(void) { #if defined(_WIN32) || defined(_WIN64) SYSTEM_INFO si; @@ -1080,7 +1072,7 @@ extern void mdbx_osal_jitter(bool tiny); #include #endif -static __maybe_unused __inline void mdbx_compiler_barrier(void) { +MDBX_MAYBE_UNUSED static __inline void mdbx_compiler_barrier(void) { #if defined(__clang__) || defined(__GNUC__) __asm__ __volatile__("" ::: "memory"); #elif defined(_MSC_VER) @@ -1100,7 +1092,7 @@ static __maybe_unused __inline void mdbx_compiler_barrier(void) { #endif } -static __maybe_unused __inline void mdbx_memory_barrier(void) { +MDBX_MAYBE_UNUSED static __inline void mdbx_memory_barrier(void) { #ifdef MDBX_HAVE_C11ATOMICS atomic_thread_fence(memory_order_seq_cst); #elif defined(__ATOMIC_SEQ_CST) @@ -1140,8 +1132,8 @@ static __maybe_unused __inline void mdbx_memory_barrier(void) { #define mdbx_asprintf asprintf #define mdbx_vasprintf vasprintf #else -MDBX_INTERNAL_FUNC MDBX_PRINTF_ARGS(2, 3) int __maybe_unused - mdbx_asprintf(char **strp, const char *fmt, ...); +MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC + MDBX_PRINTF_ARGS(2, 3) int mdbx_asprintf(char **strp, const char *fmt, ...); MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, va_list ap); #endif @@ -1164,7 +1156,7 @@ MDBX_INTERNAL_VAR bool mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */; LIBMDBX_API char *mdbx_strdup(const char *str); #endif -static __maybe_unused __inline int mdbx_get_errno(void) { +MDBX_MAYBE_UNUSED static __inline int mdbx_get_errno(void) { #if defined(_WIN32) || defined(_WIN64) DWORD rc = GetLastError(); #else @@ -1248,8 +1240,10 @@ MDBX_INTERNAL_FUNC int mdbx_mmap(const int flags, mdbx_mmap_t *map, const size_t must, const size_t limit, const unsigned options); MDBX_INTERNAL_FUNC int mdbx_munmap(mdbx_mmap_t *map); -MDBX_INTERNAL_FUNC int mdbx_mresize(int flags, mdbx_mmap_t *map, size_t current, - size_t wanna, const bool may_move); +#define MDBX_MRESIZE_MAY_MOVE 0x00000100 +#define MDBX_MRESIZE_MAY_UNMAP 0x00000200 +MDBX_INTERNAL_FUNC int mdbx_mresize(const int flags, mdbx_mmap_t *map, + size_t size, size_t limit); #if defined(_WIN32) || defined(_WIN64) typedef struct { unsigned limit, count; @@ -1266,7 +1260,7 @@ MDBX_INTERNAL_FUNC int mdbx_msync(mdbx_mmap_t *map, size_t offset, MDBX_INTERNAL_FUNC int mdbx_check_fs_rdonly(mdbx_filehandle_t handle, const char *pathname, int err); -static __maybe_unused __inline uint32_t mdbx_getpid(void) { +MDBX_MAYBE_UNUSED static __inline uint32_t mdbx_getpid(void) { STATIC_ASSERT(sizeof(mdbx_pid_t) <= sizeof(uint32_t)); #if defined(_WIN32) || defined(_WIN64) return GetCurrentProcessId(); @@ -1275,7 +1269,7 @@ static __maybe_unused __inline uint32_t mdbx_getpid(void) { #endif } -static __maybe_unused __inline uintptr_t mdbx_thread_self(void) { +MDBX_MAYBE_UNUSED static __inline uintptr_t mdbx_thread_self(void) { mdbx_tid_t thunk; STATIC_ASSERT(sizeof(uintptr_t) >= sizeof(thunk)); #if defined(_WIN32) || defined(_WIN64) @@ -1286,7 +1280,7 @@ static __maybe_unused __inline uintptr_t mdbx_thread_self(void) { return (uintptr_t)thunk; } -MDBX_INTERNAL_FUNC void __maybe_unused mdbx_osal_jitter(bool tiny); +MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC void mdbx_osal_jitter(bool tiny); MDBX_INTERNAL_FUNC uint64_t mdbx_osal_monotime(void); MDBX_INTERNAL_FUNC uint64_t mdbx_osal_16dot16_to_monotime(uint32_t seconds_16dot16); @@ -2057,7 +2051,7 @@ static __always_inline memory_order mo_c11_load(enum 
MDBX_memory_order fence) { static __inline void mdbx_jitter4testing(bool tiny); -static __maybe_unused __always_inline void +MDBX_MAYBE_UNUSED static __always_inline void mdbx_memory_fence(enum MDBX_memory_order order, bool write) { #ifdef MDBX_HAVE_C11ATOMICS atomic_thread_fence(write ? mo_c11_store(order) : mo_c11_load(order)); @@ -2069,7 +2063,7 @@ mdbx_memory_fence(enum MDBX_memory_order order, bool write) { #endif /* MDBX_HAVE_C11ATOMICS */ } -static __maybe_unused __always_inline uint32_t +MDBX_MAYBE_UNUSED static __always_inline uint32_t atomic_store32(MDBX_atomic_uint32_t *p, const uint32_t value, enum MDBX_memory_order order) { STATIC_ASSERT(sizeof(MDBX_atomic_uint32_t) == 4); @@ -2085,7 +2079,7 @@ atomic_store32(MDBX_atomic_uint32_t *p, const uint32_t value, return value; } -static __maybe_unused __always_inline uint32_t +MDBX_MAYBE_UNUSED static __always_inline uint32_t atomic_load32(const MDBX_atomic_uint32_t *p, enum MDBX_memory_order order) { STATIC_ASSERT(sizeof(MDBX_atomic_uint32_t) == 4); #ifdef MDBX_HAVE_C11ATOMICS @@ -2100,7 +2094,7 @@ atomic_load32(const MDBX_atomic_uint32_t *p, enum MDBX_memory_order order) { #endif /* MDBX_HAVE_C11ATOMICS */ } -static __maybe_unused __always_inline uint64_t +MDBX_MAYBE_UNUSED static __always_inline uint64_t atomic_store64(MDBX_atomic_uint64_t *p, const uint64_t value, enum MDBX_memory_order order) { STATIC_ASSERT(sizeof(MDBX_atomic_uint64_t) == 8); @@ -2124,7 +2118,7 @@ atomic_store64(MDBX_atomic_uint64_t *p, const uint64_t value, return value; } -static __maybe_unused +MDBX_MAYBE_UNUSED static #if MDBX_64BIT_ATOMIC __always_inline #endif /* MDBX_64BIT_ATOMIC */ @@ -2712,15 +2706,7 @@ typedef struct MDBX_dbx { * Every operation requires a transaction handle. */ struct MDBX_txn { #define MDBX_MT_SIGNATURE UINT32_C(0x93D53A31) - size_t mt_signature; - MDBX_txn *mt_parent; /* parent of a nested txn */ - /* Nested txn under this txn, set together with flag MDBX_TXN_HAS_CHILD */ - MDBX_txn *mt_child; - MDBX_geo mt_geo; - /* next unallocated page */ -#define mt_next_pgno mt_geo.next - /* corresponding to the current size of datafile */ -#define mt_end_pgno mt_geo.now + uint32_t mt_signature; /* Transaction Flags */ /* mdbx_txn_begin() flags */ @@ -2749,8 +2735,17 @@ struct MDBX_txn { MDBX_SHRINK_ALLOWED) #error "Oops, some flags overlapped or wrong" #endif + uint32_t mt_flags; + + MDBX_txn *mt_parent; /* parent of a nested txn */ + /* Nested txn under this txn, set together with flag MDBX_TXN_HAS_CHILD */ + MDBX_txn *mt_child; + MDBX_geo mt_geo; + /* next unallocated page */ +#define mt_next_pgno mt_geo.next + /* corresponding to the current size of datafile */ +#define mt_end_pgno mt_geo.now - unsigned mt_flags; /* The ID of this transaction. IDs are integers incrementing from 1. * Only committed write transactions increment the ID. If a transaction * aborts, the ID may be re-used by the next writer. 
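(On the MDBX_txn hunk above: mt_signature shrinks from size_t to uint32_t, and mt_flags, itself narrowed to uint32_t, moves up to sit beside it ahead of the mt_parent pointer. One plausible motivation, stated here as an assumption rather than taken from the source: on LP64 targets the two 32-bit fields pack into the eight bytes the old size_t signature occupied alone. A layout sketch with hypothetical struct names, assuming 8-byte pointers and natural alignment:

struct txn_old { size_t sig; MDBX_txn *parent; unsigned flags; };
struct txn_new { uint32_t sig; uint32_t flags; MDBX_txn *parent; };

These weigh in at 24 and 16 bytes respectively: in txn_new the flags word occupies what would otherwise be alignment padding after the 32-bit signature.)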
*/ @@ -3170,7 +3165,7 @@ void mdbx_assert_fail(const MDBX_env *env, const char *msg, const char *func, #define mdbx_flush_incoherent_cpu_writeback() mdbx_compiler_barrier() #endif /* MDBX_CPU_WRITEBACK_INCOHERENT */ -static __maybe_unused __inline void +MDBX_MAYBE_UNUSED static __inline void mdbx_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { #if MDBX_MMAP_INCOHERENT_FILE_WRITE char *const begin = (char *)(-pagesize & (intptr_t)addr); @@ -3212,7 +3207,7 @@ MDBX_INTERNAL_FUNC void mdbx_rthc_global_init(void); MDBX_INTERNAL_FUNC void mdbx_rthc_global_dtor(void); MDBX_INTERNAL_FUNC void mdbx_rthc_thread_dtor(void *ptr); -static __maybe_unused __inline void mdbx_jitter4testing(bool tiny) { +MDBX_MAYBE_UNUSED static __inline void mdbx_jitter4testing(bool tiny) { #if MDBX_DEBUG if (MDBX_DBG_JITTER & mdbx_runtime_flags) mdbx_osal_jitter(tiny); @@ -3365,35 +3360,35 @@ typedef struct MDBX_node { /* Do not spill pages to disk if txn is getting full, may fail instead */ #define MDBX_NOSPILL 0x8000 -MDBX_NOTHROW_CONST_FUNCTION static __maybe_unused __inline pgno_t +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t pgno_add(pgno_t base, pgno_t augend) { assert(base <= MAX_PAGENO); return (augend < MAX_PAGENO - base) ? base + augend : MAX_PAGENO; } -MDBX_NOTHROW_CONST_FUNCTION static __maybe_unused __inline pgno_t +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t pgno_sub(pgno_t base, pgno_t subtrahend) { assert(base >= MIN_PAGENO); return (subtrahend < base - MIN_PAGENO) ? base - subtrahend : MIN_PAGENO; } -MDBX_NOTHROW_CONST_FUNCTION static __always_inline __maybe_unused bool +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline bool is_powerof2(size_t x) { return (x & (x - 1)) == 0; } -MDBX_NOTHROW_CONST_FUNCTION static __always_inline __maybe_unused size_t +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline size_t floor_powerof2(size_t value, size_t granularity) { assert(is_powerof2(granularity)); return value & ~(granularity - 1); } -MDBX_NOTHROW_CONST_FUNCTION static __always_inline __maybe_unused size_t +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline size_t ceil_powerof2(size_t value, size_t granularity) { return floor_powerof2(value + granularity - 1, granularity); } -MDBX_NOTHROW_CONST_FUNCTION static __maybe_unused unsigned +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static unsigned log2n_powerof2(size_t value) { assert(value > 0 && value < INT32_MAX && is_powerof2(value)); assert((value & -(int32_t)value) == value); @@ -3423,7 +3418,7 @@ log2n_powerof2(size_t value) { #define ENV_USABLE_FLAGS (ENV_CHANGEABLE_FLAGS | ENV_CHANGELESS_FLAGS) #if !defined(__cplusplus) || CONSTEXPR_ENUM_FLAGS_OPERATIONS -static __maybe_unused void static_checks(void) { +MDBX_MAYBE_UNUSED static void static_checks(void) { STATIC_ASSERT_MSG(INT16_MAX - CORE_DBS == MDBX_MAX_DBI, "Oops, MDBX_MAX_DBI or CORE_DBS?"); STATIC_ASSERT_MSG((unsigned)(MDBX_DB_ACCEDE | MDBX_CREATE) == diff --git a/mdbx/dist/mdbx_drop.c b/mdbx/dist/mdbx_drop.c index 7fdbab0..f8e09fd 100644 --- a/mdbx/dist/mdbx_drop.c +++ b/mdbx/dist/mdbx_drop.c @@ -36,7 +36,7 @@ * top-level directory of the distribution or, alternatively, at * . 
*/ -#define MDBX_BUILD_SOURCERY 10aa116f5f6a1fca4ccea1310d3d331a39161abc5b63b6a30e01812eab671e7c_v0_10_0_0_gaa1f6fbd +#define MDBX_BUILD_SOURCERY 220eee3b3a4b48cb20897d772c6665de65226e5811d687b6f7f43b2beeb9cb31_v0_10_1_15_g63e7276c #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -230,14 +230,6 @@ # endif #endif /* __must_check_result */ -#ifndef __maybe_unused -# if defined(__GNUC__) || __has_attribute(__unused__) -# define __maybe_unused __attribute__((__unused__)) -# else -# define __maybe_unused -# endif -#endif /* __maybe_unused */ - #if !defined(__noop) && !defined(_MSC_VER) # define __noop(...) do {} while(0) #endif /* __noop */ @@ -383,7 +375,7 @@ #endif /* __anonymous_struct_extension__ */ #ifndef __Wpedantic_format_voidptr - static __inline __maybe_unused const void* MDBX_PURE_FUNCTION + MDBX_MAYBE_UNUSED MDBX_PURE_FUNCTION static __inline const void* __Wpedantic_format_voidptr(const void* ptr) {return ptr;} # define __Wpedantic_format_voidptr(ARG) __Wpedantic_format_voidptr(ARG) #endif /* __Wpedantic_format_voidptr */ @@ -995,7 +987,7 @@ typedef pthread_mutex_t mdbx_fastmutex_t; /* Get the size of a memory page for the system. * This is the basic size that the platform's memory manager uses, and is * fundamental to the use of memory-mapped files. */ -MDBX_NOTHROW_CONST_FUNCTION static __maybe_unused __inline size_t +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline size_t mdbx_syspagesize(void) { #if defined(_WIN32) || defined(_WIN64) SYSTEM_INFO si; @@ -1082,7 +1074,7 @@ extern void mdbx_osal_jitter(bool tiny); #include #endif -static __maybe_unused __inline void mdbx_compiler_barrier(void) { +MDBX_MAYBE_UNUSED static __inline void mdbx_compiler_barrier(void) { #if defined(__clang__) || defined(__GNUC__) __asm__ __volatile__("" ::: "memory"); #elif defined(_MSC_VER) @@ -1102,7 +1094,7 @@ static __maybe_unused __inline void mdbx_compiler_barrier(void) { #endif } -static __maybe_unused __inline void mdbx_memory_barrier(void) { +MDBX_MAYBE_UNUSED static __inline void mdbx_memory_barrier(void) { #ifdef MDBX_HAVE_C11ATOMICS atomic_thread_fence(memory_order_seq_cst); #elif defined(__ATOMIC_SEQ_CST) @@ -1142,8 +1134,8 @@ static __maybe_unused __inline void mdbx_memory_barrier(void) { #define mdbx_asprintf asprintf #define mdbx_vasprintf vasprintf #else -MDBX_INTERNAL_FUNC MDBX_PRINTF_ARGS(2, 3) int __maybe_unused - mdbx_asprintf(char **strp, const char *fmt, ...); +MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC + MDBX_PRINTF_ARGS(2, 3) int mdbx_asprintf(char **strp, const char *fmt, ...); MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, va_list ap); #endif @@ -1166,7 +1158,7 @@ MDBX_INTERNAL_VAR bool mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */; LIBMDBX_API char *mdbx_strdup(const char *str); #endif -static __maybe_unused __inline int mdbx_get_errno(void) { +MDBX_MAYBE_UNUSED static __inline int mdbx_get_errno(void) { #if defined(_WIN32) || defined(_WIN64) DWORD rc = GetLastError(); #else @@ -1250,8 +1242,10 @@ MDBX_INTERNAL_FUNC int mdbx_mmap(const int flags, mdbx_mmap_t *map, const size_t must, const size_t limit, const unsigned options); MDBX_INTERNAL_FUNC int mdbx_munmap(mdbx_mmap_t *map); -MDBX_INTERNAL_FUNC int mdbx_mresize(int flags, mdbx_mmap_t *map, size_t current, - size_t wanna, const bool may_move); +#define MDBX_MRESIZE_MAY_MOVE 0x00000100 +#define MDBX_MRESIZE_MAY_UNMAP 0x00000200 +MDBX_INTERNAL_FUNC int mdbx_mresize(const int flags, mdbx_mmap_t *map, + size_t size, size_t limit); #if defined(_WIN32) || defined(_WIN64) typedef struct { 
unsigned limit, count; @@ -1268,7 +1262,7 @@ MDBX_INTERNAL_FUNC int mdbx_msync(mdbx_mmap_t *map, size_t offset, MDBX_INTERNAL_FUNC int mdbx_check_fs_rdonly(mdbx_filehandle_t handle, const char *pathname, int err); -static __maybe_unused __inline uint32_t mdbx_getpid(void) { +MDBX_MAYBE_UNUSED static __inline uint32_t mdbx_getpid(void) { STATIC_ASSERT(sizeof(mdbx_pid_t) <= sizeof(uint32_t)); #if defined(_WIN32) || defined(_WIN64) return GetCurrentProcessId(); @@ -1277,7 +1271,7 @@ static __maybe_unused __inline uint32_t mdbx_getpid(void) { #endif } -static __maybe_unused __inline uintptr_t mdbx_thread_self(void) { +MDBX_MAYBE_UNUSED static __inline uintptr_t mdbx_thread_self(void) { mdbx_tid_t thunk; STATIC_ASSERT(sizeof(uintptr_t) >= sizeof(thunk)); #if defined(_WIN32) || defined(_WIN64) @@ -1288,7 +1282,7 @@ static __maybe_unused __inline uintptr_t mdbx_thread_self(void) { return (uintptr_t)thunk; } -MDBX_INTERNAL_FUNC void __maybe_unused mdbx_osal_jitter(bool tiny); +MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC void mdbx_osal_jitter(bool tiny); MDBX_INTERNAL_FUNC uint64_t mdbx_osal_monotime(void); MDBX_INTERNAL_FUNC uint64_t mdbx_osal_16dot16_to_monotime(uint32_t seconds_16dot16); @@ -2059,7 +2053,7 @@ static __always_inline memory_order mo_c11_load(enum MDBX_memory_order fence) { static __inline void mdbx_jitter4testing(bool tiny); -static __maybe_unused __always_inline void +MDBX_MAYBE_UNUSED static __always_inline void mdbx_memory_fence(enum MDBX_memory_order order, bool write) { #ifdef MDBX_HAVE_C11ATOMICS atomic_thread_fence(write ? mo_c11_store(order) : mo_c11_load(order)); @@ -2071,7 +2065,7 @@ mdbx_memory_fence(enum MDBX_memory_order order, bool write) { #endif /* MDBX_HAVE_C11ATOMICS */ } -static __maybe_unused __always_inline uint32_t +MDBX_MAYBE_UNUSED static __always_inline uint32_t atomic_store32(MDBX_atomic_uint32_t *p, const uint32_t value, enum MDBX_memory_order order) { STATIC_ASSERT(sizeof(MDBX_atomic_uint32_t) == 4); @@ -2087,7 +2081,7 @@ atomic_store32(MDBX_atomic_uint32_t *p, const uint32_t value, return value; } -static __maybe_unused __always_inline uint32_t +MDBX_MAYBE_UNUSED static __always_inline uint32_t atomic_load32(const MDBX_atomic_uint32_t *p, enum MDBX_memory_order order) { STATIC_ASSERT(sizeof(MDBX_atomic_uint32_t) == 4); #ifdef MDBX_HAVE_C11ATOMICS @@ -2102,7 +2096,7 @@ atomic_load32(const MDBX_atomic_uint32_t *p, enum MDBX_memory_order order) { #endif /* MDBX_HAVE_C11ATOMICS */ } -static __maybe_unused __always_inline uint64_t +MDBX_MAYBE_UNUSED static __always_inline uint64_t atomic_store64(MDBX_atomic_uint64_t *p, const uint64_t value, enum MDBX_memory_order order) { STATIC_ASSERT(sizeof(MDBX_atomic_uint64_t) == 8); @@ -2126,7 +2120,7 @@ atomic_store64(MDBX_atomic_uint64_t *p, const uint64_t value, return value; } -static __maybe_unused +MDBX_MAYBE_UNUSED static #if MDBX_64BIT_ATOMIC __always_inline #endif /* MDBX_64BIT_ATOMIC */ @@ -2714,15 +2708,7 @@ typedef struct MDBX_dbx { * Every operation requires a transaction handle. 
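(The same preamble change recurs in each tool; one further note on it: mdbx_mresize drops the separate current/wanna sizes and the trailing may_move bool in favour of a size/limit pair with behaviour encoded in bit-flags. A hedged sketch of a call under the new signature, with illustrative variable names not taken from the source:

int rc = mdbx_mresize(MDBX_MRESIZE_MAY_MOVE | MDBX_MRESIZE_MAY_UNMAP,
                      &env_map, wanted_size, wanted_limit);

Permission to remap or unmap is thus a property of the flags argument rather than a separate boolean parameter.)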
*/ struct MDBX_txn { #define MDBX_MT_SIGNATURE UINT32_C(0x93D53A31) - size_t mt_signature; - MDBX_txn *mt_parent; /* parent of a nested txn */ - /* Nested txn under this txn, set together with flag MDBX_TXN_HAS_CHILD */ - MDBX_txn *mt_child; - MDBX_geo mt_geo; - /* next unallocated page */ -#define mt_next_pgno mt_geo.next - /* corresponding to the current size of datafile */ -#define mt_end_pgno mt_geo.now + uint32_t mt_signature; /* Transaction Flags */ /* mdbx_txn_begin() flags */ @@ -2751,8 +2737,17 @@ struct MDBX_txn { MDBX_SHRINK_ALLOWED) #error "Oops, some flags overlapped or wrong" #endif + uint32_t mt_flags; + + MDBX_txn *mt_parent; /* parent of a nested txn */ + /* Nested txn under this txn, set together with flag MDBX_TXN_HAS_CHILD */ + MDBX_txn *mt_child; + MDBX_geo mt_geo; + /* next unallocated page */ +#define mt_next_pgno mt_geo.next + /* corresponding to the current size of datafile */ +#define mt_end_pgno mt_geo.now - unsigned mt_flags; /* The ID of this transaction. IDs are integers incrementing from 1. * Only committed write transactions increment the ID. If a transaction * aborts, the ID may be re-used by the next writer. */ @@ -3172,7 +3167,7 @@ void mdbx_assert_fail(const MDBX_env *env, const char *msg, const char *func, #define mdbx_flush_incoherent_cpu_writeback() mdbx_compiler_barrier() #endif /* MDBX_CPU_WRITEBACK_INCOHERENT */ -static __maybe_unused __inline void +MDBX_MAYBE_UNUSED static __inline void mdbx_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { #if MDBX_MMAP_INCOHERENT_FILE_WRITE char *const begin = (char *)(-pagesize & (intptr_t)addr); @@ -3214,7 +3209,7 @@ MDBX_INTERNAL_FUNC void mdbx_rthc_global_init(void); MDBX_INTERNAL_FUNC void mdbx_rthc_global_dtor(void); MDBX_INTERNAL_FUNC void mdbx_rthc_thread_dtor(void *ptr); -static __maybe_unused __inline void mdbx_jitter4testing(bool tiny) { +MDBX_MAYBE_UNUSED static __inline void mdbx_jitter4testing(bool tiny) { #if MDBX_DEBUG if (MDBX_DBG_JITTER & mdbx_runtime_flags) mdbx_osal_jitter(tiny); @@ -3367,35 +3362,35 @@ typedef struct MDBX_node { /* Do not spill pages to disk if txn is getting full, may fail instead */ #define MDBX_NOSPILL 0x8000 -MDBX_NOTHROW_CONST_FUNCTION static __maybe_unused __inline pgno_t +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t pgno_add(pgno_t base, pgno_t augend) { assert(base <= MAX_PAGENO); return (augend < MAX_PAGENO - base) ? base + augend : MAX_PAGENO; } -MDBX_NOTHROW_CONST_FUNCTION static __maybe_unused __inline pgno_t +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t pgno_sub(pgno_t base, pgno_t subtrahend) { assert(base >= MIN_PAGENO); return (subtrahend < base - MIN_PAGENO) ? 
base - subtrahend : MIN_PAGENO; } -MDBX_NOTHROW_CONST_FUNCTION static __always_inline __maybe_unused bool +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline bool is_powerof2(size_t x) { return (x & (x - 1)) == 0; } -MDBX_NOTHROW_CONST_FUNCTION static __always_inline __maybe_unused size_t +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline size_t floor_powerof2(size_t value, size_t granularity) { assert(is_powerof2(granularity)); return value & ~(granularity - 1); } -MDBX_NOTHROW_CONST_FUNCTION static __always_inline __maybe_unused size_t +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline size_t ceil_powerof2(size_t value, size_t granularity) { return floor_powerof2(value + granularity - 1, granularity); } -MDBX_NOTHROW_CONST_FUNCTION static __maybe_unused unsigned +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static unsigned log2n_powerof2(size_t value) { assert(value > 0 && value < INT32_MAX && is_powerof2(value)); assert((value & -(int32_t)value) == value); @@ -3425,7 +3420,7 @@ log2n_powerof2(size_t value) { #define ENV_USABLE_FLAGS (ENV_CHANGEABLE_FLAGS | ENV_CHANGELESS_FLAGS) #if !defined(__cplusplus) || CONSTEXPR_ENUM_FLAGS_OPERATIONS -static __maybe_unused void static_checks(void) { +MDBX_MAYBE_UNUSED static void static_checks(void) { STATIC_ASSERT_MSG(INT16_MAX - CORE_DBS == MDBX_MAX_DBI, "Oops, MDBX_MAX_DBI or CORE_DBS?"); STATIC_ASSERT_MSG((unsigned)(MDBX_DB_ACCEDE | MDBX_CREATE) == @@ -3556,7 +3551,9 @@ static void usage(void) { "usage: %s [-V] [-q] [-d] [-s name] dbpath\n" " -V\t\tprint version and exit\n" " -q\t\tbe quiet\n" - " -d\t\tdelete the specified database, don't just empty it\n", + " -d\t\tdelete the specified database, don't just empty it\n" + " -s name\tdrop the specified named subDB\n" + " \t\tby default empty the main DB\n", prog); exit(EXIT_FAILURE); } @@ -3581,11 +3578,12 @@ int main(int argc, char *argv[]) { if (argc < 2) usage(); - /* -d: delete the db, don't just empty it - * -s: drop the named subDB - * -V: print version and exit - * (default) empty the main DB */ - while ((i = getopt(argc, argv, "ds:nV")) != EOF) { + while ((i = getopt(argc, argv, + "d" + "s:" + "n" + "q" + "V")) != EOF) { switch (i) { case 'V': printf("mdbx_drop version %d.%d.%d.%d\n" @@ -3601,11 +3599,13 @@ int main(int argc, char *argv[]) { mdbx_build.target, mdbx_build.compiler, mdbx_build.flags, mdbx_build.options); return EXIT_SUCCESS; + case 'q': + quiet = true; + break; case 'd': delete = true; break; case 'n': - envflags |= MDBX_NOSUBDIR; break; case 's': subname = optarg; @@ -3632,11 +3632,12 @@ int main(int argc, char *argv[]) { #endif /* !WINDOWS */ envname = argv[optind]; - if (!quiet) + if (!quiet) { printf("mdbx_drop %s (%s, T-%s)\nRunning for %s/%s...\n", mdbx_version.git.describe, mdbx_version.git.datetime, mdbx_version.git.tree, envname, subname ? subname : "@MAIN"); - fflush(nullptr); + fflush(nullptr); + } rc = mdbx_env_create(&env); if (unlikely(rc != MDBX_SUCCESS)) { diff --git a/mdbx/dist/mdbx_dump.c b/mdbx/dist/mdbx_dump.c index 83fcc9d..2808bf3 100644 --- a/mdbx/dist/mdbx_dump.c +++ b/mdbx/dist/mdbx_dump.c @@ -34,7 +34,7 @@ * top-level directory of the distribution or, alternatively, at * . 
*/ -#define MDBX_BUILD_SOURCERY 10aa116f5f6a1fca4ccea1310d3d331a39161abc5b63b6a30e01812eab671e7c_v0_10_0_0_gaa1f6fbd +#define MDBX_BUILD_SOURCERY 220eee3b3a4b48cb20897d772c6665de65226e5811d687b6f7f43b2beeb9cb31_v0_10_1_15_g63e7276c #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -228,14 +228,6 @@ # endif #endif /* __must_check_result */ -#ifndef __maybe_unused -# if defined(__GNUC__) || __has_attribute(__unused__) -# define __maybe_unused __attribute__((__unused__)) -# else -# define __maybe_unused -# endif -#endif /* __maybe_unused */ - #if !defined(__noop) && !defined(_MSC_VER) # define __noop(...) do {} while(0) #endif /* __noop */ @@ -381,7 +373,7 @@ #endif /* __anonymous_struct_extension__ */ #ifndef __Wpedantic_format_voidptr - static __inline __maybe_unused const void* MDBX_PURE_FUNCTION + MDBX_MAYBE_UNUSED MDBX_PURE_FUNCTION static __inline const void* __Wpedantic_format_voidptr(const void* ptr) {return ptr;} # define __Wpedantic_format_voidptr(ARG) __Wpedantic_format_voidptr(ARG) #endif /* __Wpedantic_format_voidptr */ @@ -993,7 +985,7 @@ typedef pthread_mutex_t mdbx_fastmutex_t; /* Get the size of a memory page for the system. * This is the basic size that the platform's memory manager uses, and is * fundamental to the use of memory-mapped files. */ -MDBX_NOTHROW_CONST_FUNCTION static __maybe_unused __inline size_t +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline size_t mdbx_syspagesize(void) { #if defined(_WIN32) || defined(_WIN64) SYSTEM_INFO si; @@ -1080,7 +1072,7 @@ extern void mdbx_osal_jitter(bool tiny); #include #endif -static __maybe_unused __inline void mdbx_compiler_barrier(void) { +MDBX_MAYBE_UNUSED static __inline void mdbx_compiler_barrier(void) { #if defined(__clang__) || defined(__GNUC__) __asm__ __volatile__("" ::: "memory"); #elif defined(_MSC_VER) @@ -1100,7 +1092,7 @@ static __maybe_unused __inline void mdbx_compiler_barrier(void) { #endif } -static __maybe_unused __inline void mdbx_memory_barrier(void) { +MDBX_MAYBE_UNUSED static __inline void mdbx_memory_barrier(void) { #ifdef MDBX_HAVE_C11ATOMICS atomic_thread_fence(memory_order_seq_cst); #elif defined(__ATOMIC_SEQ_CST) @@ -1140,8 +1132,8 @@ static __maybe_unused __inline void mdbx_memory_barrier(void) { #define mdbx_asprintf asprintf #define mdbx_vasprintf vasprintf #else -MDBX_INTERNAL_FUNC MDBX_PRINTF_ARGS(2, 3) int __maybe_unused - mdbx_asprintf(char **strp, const char *fmt, ...); +MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC + MDBX_PRINTF_ARGS(2, 3) int mdbx_asprintf(char **strp, const char *fmt, ...); MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, va_list ap); #endif @@ -1164,7 +1156,7 @@ MDBX_INTERNAL_VAR bool mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */; LIBMDBX_API char *mdbx_strdup(const char *str); #endif -static __maybe_unused __inline int mdbx_get_errno(void) { +MDBX_MAYBE_UNUSED static __inline int mdbx_get_errno(void) { #if defined(_WIN32) || defined(_WIN64) DWORD rc = GetLastError(); #else @@ -1248,8 +1240,10 @@ MDBX_INTERNAL_FUNC int mdbx_mmap(const int flags, mdbx_mmap_t *map, const size_t must, const size_t limit, const unsigned options); MDBX_INTERNAL_FUNC int mdbx_munmap(mdbx_mmap_t *map); -MDBX_INTERNAL_FUNC int mdbx_mresize(int flags, mdbx_mmap_t *map, size_t current, - size_t wanna, const bool may_move); +#define MDBX_MRESIZE_MAY_MOVE 0x00000100 +#define MDBX_MRESIZE_MAY_UNMAP 0x00000200 +MDBX_INTERNAL_FUNC int mdbx_mresize(const int flags, mdbx_mmap_t *map, + size_t size, size_t limit); #if defined(_WIN32) || defined(_WIN64) typedef struct { 
unsigned limit, count; @@ -1266,7 +1260,7 @@ MDBX_INTERNAL_FUNC int mdbx_msync(mdbx_mmap_t *map, size_t offset, MDBX_INTERNAL_FUNC int mdbx_check_fs_rdonly(mdbx_filehandle_t handle, const char *pathname, int err); -static __maybe_unused __inline uint32_t mdbx_getpid(void) { +MDBX_MAYBE_UNUSED static __inline uint32_t mdbx_getpid(void) { STATIC_ASSERT(sizeof(mdbx_pid_t) <= sizeof(uint32_t)); #if defined(_WIN32) || defined(_WIN64) return GetCurrentProcessId(); @@ -1275,7 +1269,7 @@ static __maybe_unused __inline uint32_t mdbx_getpid(void) { #endif } -static __maybe_unused __inline uintptr_t mdbx_thread_self(void) { +MDBX_MAYBE_UNUSED static __inline uintptr_t mdbx_thread_self(void) { mdbx_tid_t thunk; STATIC_ASSERT(sizeof(uintptr_t) >= sizeof(thunk)); #if defined(_WIN32) || defined(_WIN64) @@ -1286,7 +1280,7 @@ static __maybe_unused __inline uintptr_t mdbx_thread_self(void) { return (uintptr_t)thunk; } -MDBX_INTERNAL_FUNC void __maybe_unused mdbx_osal_jitter(bool tiny); +MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC void mdbx_osal_jitter(bool tiny); MDBX_INTERNAL_FUNC uint64_t mdbx_osal_monotime(void); MDBX_INTERNAL_FUNC uint64_t mdbx_osal_16dot16_to_monotime(uint32_t seconds_16dot16); @@ -2057,7 +2051,7 @@ static __always_inline memory_order mo_c11_load(enum MDBX_memory_order fence) { static __inline void mdbx_jitter4testing(bool tiny); -static __maybe_unused __always_inline void +MDBX_MAYBE_UNUSED static __always_inline void mdbx_memory_fence(enum MDBX_memory_order order, bool write) { #ifdef MDBX_HAVE_C11ATOMICS atomic_thread_fence(write ? mo_c11_store(order) : mo_c11_load(order)); @@ -2069,7 +2063,7 @@ mdbx_memory_fence(enum MDBX_memory_order order, bool write) { #endif /* MDBX_HAVE_C11ATOMICS */ } -static __maybe_unused __always_inline uint32_t +MDBX_MAYBE_UNUSED static __always_inline uint32_t atomic_store32(MDBX_atomic_uint32_t *p, const uint32_t value, enum MDBX_memory_order order) { STATIC_ASSERT(sizeof(MDBX_atomic_uint32_t) == 4); @@ -2085,7 +2079,7 @@ atomic_store32(MDBX_atomic_uint32_t *p, const uint32_t value, return value; } -static __maybe_unused __always_inline uint32_t +MDBX_MAYBE_UNUSED static __always_inline uint32_t atomic_load32(const MDBX_atomic_uint32_t *p, enum MDBX_memory_order order) { STATIC_ASSERT(sizeof(MDBX_atomic_uint32_t) == 4); #ifdef MDBX_HAVE_C11ATOMICS @@ -2100,7 +2094,7 @@ atomic_load32(const MDBX_atomic_uint32_t *p, enum MDBX_memory_order order) { #endif /* MDBX_HAVE_C11ATOMICS */ } -static __maybe_unused __always_inline uint64_t +MDBX_MAYBE_UNUSED static __always_inline uint64_t atomic_store64(MDBX_atomic_uint64_t *p, const uint64_t value, enum MDBX_memory_order order) { STATIC_ASSERT(sizeof(MDBX_atomic_uint64_t) == 8); @@ -2124,7 +2118,7 @@ atomic_store64(MDBX_atomic_uint64_t *p, const uint64_t value, return value; } -static __maybe_unused +MDBX_MAYBE_UNUSED static #if MDBX_64BIT_ATOMIC __always_inline #endif /* MDBX_64BIT_ATOMIC */ @@ -2712,15 +2706,7 @@ typedef struct MDBX_dbx { * Every operation requires a transaction handle. 
*/ struct MDBX_txn { #define MDBX_MT_SIGNATURE UINT32_C(0x93D53A31) - size_t mt_signature; - MDBX_txn *mt_parent; /* parent of a nested txn */ - /* Nested txn under this txn, set together with flag MDBX_TXN_HAS_CHILD */ - MDBX_txn *mt_child; - MDBX_geo mt_geo; - /* next unallocated page */ -#define mt_next_pgno mt_geo.next - /* corresponding to the current size of datafile */ -#define mt_end_pgno mt_geo.now + uint32_t mt_signature; /* Transaction Flags */ /* mdbx_txn_begin() flags */ @@ -2749,8 +2735,17 @@ struct MDBX_txn { MDBX_SHRINK_ALLOWED) #error "Oops, some flags overlapped or wrong" #endif + uint32_t mt_flags; + + MDBX_txn *mt_parent; /* parent of a nested txn */ + /* Nested txn under this txn, set together with flag MDBX_TXN_HAS_CHILD */ + MDBX_txn *mt_child; + MDBX_geo mt_geo; + /* next unallocated page */ +#define mt_next_pgno mt_geo.next + /* corresponding to the current size of datafile */ +#define mt_end_pgno mt_geo.now - unsigned mt_flags; /* The ID of this transaction. IDs are integers incrementing from 1. * Only committed write transactions increment the ID. If a transaction * aborts, the ID may be re-used by the next writer. */ @@ -3170,7 +3165,7 @@ void mdbx_assert_fail(const MDBX_env *env, const char *msg, const char *func, #define mdbx_flush_incoherent_cpu_writeback() mdbx_compiler_barrier() #endif /* MDBX_CPU_WRITEBACK_INCOHERENT */ -static __maybe_unused __inline void +MDBX_MAYBE_UNUSED static __inline void mdbx_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { #if MDBX_MMAP_INCOHERENT_FILE_WRITE char *const begin = (char *)(-pagesize & (intptr_t)addr); @@ -3212,7 +3207,7 @@ MDBX_INTERNAL_FUNC void mdbx_rthc_global_init(void); MDBX_INTERNAL_FUNC void mdbx_rthc_global_dtor(void); MDBX_INTERNAL_FUNC void mdbx_rthc_thread_dtor(void *ptr); -static __maybe_unused __inline void mdbx_jitter4testing(bool tiny) { +MDBX_MAYBE_UNUSED static __inline void mdbx_jitter4testing(bool tiny) { #if MDBX_DEBUG if (MDBX_DBG_JITTER & mdbx_runtime_flags) mdbx_osal_jitter(tiny); @@ -3365,35 +3360,35 @@ typedef struct MDBX_node { /* Do not spill pages to disk if txn is getting full, may fail instead */ #define MDBX_NOSPILL 0x8000 -MDBX_NOTHROW_CONST_FUNCTION static __maybe_unused __inline pgno_t +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t pgno_add(pgno_t base, pgno_t augend) { assert(base <= MAX_PAGENO); return (augend < MAX_PAGENO - base) ? base + augend : MAX_PAGENO; } -MDBX_NOTHROW_CONST_FUNCTION static __maybe_unused __inline pgno_t +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t pgno_sub(pgno_t base, pgno_t subtrahend) { assert(base >= MIN_PAGENO); return (subtrahend < base - MIN_PAGENO) ? 
base - subtrahend : MIN_PAGENO; } -MDBX_NOTHROW_CONST_FUNCTION static __always_inline __maybe_unused bool +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline bool is_powerof2(size_t x) { return (x & (x - 1)) == 0; } -MDBX_NOTHROW_CONST_FUNCTION static __always_inline __maybe_unused size_t +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline size_t floor_powerof2(size_t value, size_t granularity) { assert(is_powerof2(granularity)); return value & ~(granularity - 1); } -MDBX_NOTHROW_CONST_FUNCTION static __always_inline __maybe_unused size_t +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline size_t ceil_powerof2(size_t value, size_t granularity) { return floor_powerof2(value + granularity - 1, granularity); } -MDBX_NOTHROW_CONST_FUNCTION static __maybe_unused unsigned +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static unsigned log2n_powerof2(size_t value) { assert(value > 0 && value < INT32_MAX && is_powerof2(value)); assert((value & -(int32_t)value) == value); @@ -3423,7 +3418,7 @@ log2n_powerof2(size_t value) { #define ENV_USABLE_FLAGS (ENV_CHANGEABLE_FLAGS | ENV_CHANGELESS_FLAGS) #if !defined(__cplusplus) || CONSTEXPR_ENUM_FLAGS_OPERATIONS -static __maybe_unused void static_checks(void) { +MDBX_MAYBE_UNUSED static void static_checks(void) { STATIC_ASSERT_MSG(INT16_MAX - CORE_DBS == MDBX_MAX_DBI, "Oops, MDBX_MAX_DBI or CORE_DBS?"); STATIC_ASSERT_MSG((unsigned)(MDBX_DB_ACCEDE | MDBX_CREATE) == @@ -3603,7 +3598,9 @@ static void dumpval(MDBX_val *v) { bool quiet = false, rescue = false; const char *prog; static void error(const char *func, int rc) { - fprintf(stderr, "%s: %s() error %d %s\n", prog, func, rc, mdbx_strerror(rc)); + if (!quiet) + fprintf(stderr, "%s: %s() error %d %s\n", prog, func, rc, + mdbx_strerror(rc)); } /* Dump in BDB-compatible format */ @@ -3718,17 +3715,17 @@ static int dump_sdb(MDBX_txn *txn, MDBX_dbi dbi, char *name) { static void usage(void) { fprintf(stderr, - "usage: %s [-V] [-q] [-f file] [-l] [-p] [-a|-s subdb] [-r] " + "usage: %s [-V] [-q] [-f file] [-l] [-p] [-r] [-a|-s subdb] " "dbpath\n" " -V\t\tprint version and exit\n" " -q\t\tbe quiet\n" " -f\t\twrite to file instead of stdout\n" " -l\t\tlist subDBs and exit\n" " -p\t\tuse printable characters\n" - " -a\t\tdump main DB and all subDBs,\n" - " \t\tby default dump only the main DB\n" - " -s\t\tdump only the named subDB\n" - " -r\t\trescue mode (ignore errors to dump corrupted DB)\n", + " -r\t\trescue mode (ignore errors to dump corrupted DB)\n" + " -a\t\tdump main DB and all subDBs\n" + " -s name\tdump only the specified named subDB\n" + " \t\tby default dump only the main DB\n", prog); exit(EXIT_FAILURE); } @@ -3796,7 +3793,6 @@ int main(int argc, char *argv[]) { } break; case 'n': - envflags |= MDBX_NOSUBDIR; break; case 'p': mode |= PRINT; @@ -3933,8 +3929,9 @@ int main(int argc, char *argv[]) { if (unlikely(rc != MDBX_SUCCESS)) { if (!rescue) break; - fprintf(stderr, "%s: %s: ignore %s for `%s` and continue\n", prog, - envname, mdbx_strerror(rc), subname); + if (!quiet) + fprintf(stderr, "%s: %s: ignore %s for `%s` and continue\n", prog, + envname, mdbx_strerror(rc), subname); /* Here is a hack for rescue mode, don't do that: * - we should restart transaction in case error due * database corruption; @@ -3968,8 +3965,9 @@ int main(int argc, char *argv[]) { if (have_raw && (!count /* || rescue */)) rc = dump_sdb(txn, MAIN_DBI, nullptr); else if (!count) { - fprintf(stderr, "%s: %s does not contain multiple databases\n", prog, - envname); + if 
(!quiet) + fprintf(stderr, "%s: %s does not contain multiple databases\n", prog, + envname); rc = MDBX_NOTFOUND; } } else { @@ -3982,7 +3980,8 @@ int main(int argc, char *argv[]) { case MDBX_SUCCESS: break; case MDBX_EINTR: - fprintf(stderr, "Interrupted by signal/user\n"); + if (!quiet) + fprintf(stderr, "Interrupted by signal/user\n"); break; default: if (unlikely(rc != MDBX_SUCCESS)) diff --git a/mdbx/dist/mdbx_load.c b/mdbx/dist/mdbx_load.c index 3393924..0082699 100644 --- a/mdbx/dist/mdbx_load.c +++ b/mdbx/dist/mdbx_load.c @@ -34,7 +34,7 @@ * top-level directory of the distribution or, alternatively, at * . */ -#define MDBX_BUILD_SOURCERY 10aa116f5f6a1fca4ccea1310d3d331a39161abc5b63b6a30e01812eab671e7c_v0_10_0_0_gaa1f6fbd +#define MDBX_BUILD_SOURCERY 220eee3b3a4b48cb20897d772c6665de65226e5811d687b6f7f43b2beeb9cb31_v0_10_1_15_g63e7276c #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -228,14 +228,6 @@ # endif #endif /* __must_check_result */ -#ifndef __maybe_unused -# if defined(__GNUC__) || __has_attribute(__unused__) -# define __maybe_unused __attribute__((__unused__)) -# else -# define __maybe_unused -# endif -#endif /* __maybe_unused */ - #if !defined(__noop) && !defined(_MSC_VER) # define __noop(...) do {} while(0) #endif /* __noop */ @@ -381,7 +373,7 @@ #endif /* __anonymous_struct_extension__ */ #ifndef __Wpedantic_format_voidptr - static __inline __maybe_unused const void* MDBX_PURE_FUNCTION + MDBX_MAYBE_UNUSED MDBX_PURE_FUNCTION static __inline const void* __Wpedantic_format_voidptr(const void* ptr) {return ptr;} # define __Wpedantic_format_voidptr(ARG) __Wpedantic_format_voidptr(ARG) #endif /* __Wpedantic_format_voidptr */ @@ -993,7 +985,7 @@ typedef pthread_mutex_t mdbx_fastmutex_t; /* Get the size of a memory page for the system. * This is the basic size that the platform's memory manager uses, and is * fundamental to the use of memory-mapped files. 
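(A pattern worth noting across the tool updates in this patch: mdbx_drop, mdbx_dump and mdbx_stat each gain or honour a -q (quiet) option, and their diagnostics are gated on it, e.g. the shared error helper now reads:

static void error(const char *func, int rc) {
  if (!quiet)
    fprintf(stderr, "%s: %s() error %d %s\n", prog, func, rc,
            mdbx_strerror(rc));
}

In the same sweep, the tools stop mapping -n to MDBX_NOSUBDIR when opening the environment.)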
*/ -MDBX_NOTHROW_CONST_FUNCTION static __maybe_unused __inline size_t +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline size_t mdbx_syspagesize(void) { #if defined(_WIN32) || defined(_WIN64) SYSTEM_INFO si; @@ -1080,7 +1072,7 @@ extern void mdbx_osal_jitter(bool tiny); #include #endif -static __maybe_unused __inline void mdbx_compiler_barrier(void) { +MDBX_MAYBE_UNUSED static __inline void mdbx_compiler_barrier(void) { #if defined(__clang__) || defined(__GNUC__) __asm__ __volatile__("" ::: "memory"); #elif defined(_MSC_VER) @@ -1100,7 +1092,7 @@ static __maybe_unused __inline void mdbx_compiler_barrier(void) { #endif } -static __maybe_unused __inline void mdbx_memory_barrier(void) { +MDBX_MAYBE_UNUSED static __inline void mdbx_memory_barrier(void) { #ifdef MDBX_HAVE_C11ATOMICS atomic_thread_fence(memory_order_seq_cst); #elif defined(__ATOMIC_SEQ_CST) @@ -1140,8 +1132,8 @@ static __maybe_unused __inline void mdbx_memory_barrier(void) { #define mdbx_asprintf asprintf #define mdbx_vasprintf vasprintf #else -MDBX_INTERNAL_FUNC MDBX_PRINTF_ARGS(2, 3) int __maybe_unused - mdbx_asprintf(char **strp, const char *fmt, ...); +MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC + MDBX_PRINTF_ARGS(2, 3) int mdbx_asprintf(char **strp, const char *fmt, ...); MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, va_list ap); #endif @@ -1164,7 +1156,7 @@ MDBX_INTERNAL_VAR bool mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */; LIBMDBX_API char *mdbx_strdup(const char *str); #endif -static __maybe_unused __inline int mdbx_get_errno(void) { +MDBX_MAYBE_UNUSED static __inline int mdbx_get_errno(void) { #if defined(_WIN32) || defined(_WIN64) DWORD rc = GetLastError(); #else @@ -1248,8 +1240,10 @@ MDBX_INTERNAL_FUNC int mdbx_mmap(const int flags, mdbx_mmap_t *map, const size_t must, const size_t limit, const unsigned options); MDBX_INTERNAL_FUNC int mdbx_munmap(mdbx_mmap_t *map); -MDBX_INTERNAL_FUNC int mdbx_mresize(int flags, mdbx_mmap_t *map, size_t current, - size_t wanna, const bool may_move); +#define MDBX_MRESIZE_MAY_MOVE 0x00000100 +#define MDBX_MRESIZE_MAY_UNMAP 0x00000200 +MDBX_INTERNAL_FUNC int mdbx_mresize(const int flags, mdbx_mmap_t *map, + size_t size, size_t limit); #if defined(_WIN32) || defined(_WIN64) typedef struct { unsigned limit, count; @@ -1266,7 +1260,7 @@ MDBX_INTERNAL_FUNC int mdbx_msync(mdbx_mmap_t *map, size_t offset, MDBX_INTERNAL_FUNC int mdbx_check_fs_rdonly(mdbx_filehandle_t handle, const char *pathname, int err); -static __maybe_unused __inline uint32_t mdbx_getpid(void) { +MDBX_MAYBE_UNUSED static __inline uint32_t mdbx_getpid(void) { STATIC_ASSERT(sizeof(mdbx_pid_t) <= sizeof(uint32_t)); #if defined(_WIN32) || defined(_WIN64) return GetCurrentProcessId(); @@ -1275,7 +1269,7 @@ static __maybe_unused __inline uint32_t mdbx_getpid(void) { #endif } -static __maybe_unused __inline uintptr_t mdbx_thread_self(void) { +MDBX_MAYBE_UNUSED static __inline uintptr_t mdbx_thread_self(void) { mdbx_tid_t thunk; STATIC_ASSERT(sizeof(uintptr_t) >= sizeof(thunk)); #if defined(_WIN32) || defined(_WIN64) @@ -1286,7 +1280,7 @@ static __maybe_unused __inline uintptr_t mdbx_thread_self(void) { return (uintptr_t)thunk; } -MDBX_INTERNAL_FUNC void __maybe_unused mdbx_osal_jitter(bool tiny); +MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC void mdbx_osal_jitter(bool tiny); MDBX_INTERNAL_FUNC uint64_t mdbx_osal_monotime(void); MDBX_INTERNAL_FUNC uint64_t mdbx_osal_16dot16_to_monotime(uint32_t seconds_16dot16); @@ -2057,7 +2051,7 @@ static __always_inline memory_order mo_c11_load(enum 
MDBX_memory_order fence) { static __inline void mdbx_jitter4testing(bool tiny); -static __maybe_unused __always_inline void +MDBX_MAYBE_UNUSED static __always_inline void mdbx_memory_fence(enum MDBX_memory_order order, bool write) { #ifdef MDBX_HAVE_C11ATOMICS atomic_thread_fence(write ? mo_c11_store(order) : mo_c11_load(order)); @@ -2069,7 +2063,7 @@ mdbx_memory_fence(enum MDBX_memory_order order, bool write) { #endif /* MDBX_HAVE_C11ATOMICS */ } -static __maybe_unused __always_inline uint32_t +MDBX_MAYBE_UNUSED static __always_inline uint32_t atomic_store32(MDBX_atomic_uint32_t *p, const uint32_t value, enum MDBX_memory_order order) { STATIC_ASSERT(sizeof(MDBX_atomic_uint32_t) == 4); @@ -2085,7 +2079,7 @@ atomic_store32(MDBX_atomic_uint32_t *p, const uint32_t value, return value; } -static __maybe_unused __always_inline uint32_t +MDBX_MAYBE_UNUSED static __always_inline uint32_t atomic_load32(const MDBX_atomic_uint32_t *p, enum MDBX_memory_order order) { STATIC_ASSERT(sizeof(MDBX_atomic_uint32_t) == 4); #ifdef MDBX_HAVE_C11ATOMICS @@ -2100,7 +2094,7 @@ atomic_load32(const MDBX_atomic_uint32_t *p, enum MDBX_memory_order order) { #endif /* MDBX_HAVE_C11ATOMICS */ } -static __maybe_unused __always_inline uint64_t +MDBX_MAYBE_UNUSED static __always_inline uint64_t atomic_store64(MDBX_atomic_uint64_t *p, const uint64_t value, enum MDBX_memory_order order) { STATIC_ASSERT(sizeof(MDBX_atomic_uint64_t) == 8); @@ -2124,7 +2118,7 @@ atomic_store64(MDBX_atomic_uint64_t *p, const uint64_t value, return value; } -static __maybe_unused +MDBX_MAYBE_UNUSED static #if MDBX_64BIT_ATOMIC __always_inline #endif /* MDBX_64BIT_ATOMIC */ @@ -2712,15 +2706,7 @@ typedef struct MDBX_dbx { * Every operation requires a transaction handle. */ struct MDBX_txn { #define MDBX_MT_SIGNATURE UINT32_C(0x93D53A31) - size_t mt_signature; - MDBX_txn *mt_parent; /* parent of a nested txn */ - /* Nested txn under this txn, set together with flag MDBX_TXN_HAS_CHILD */ - MDBX_txn *mt_child; - MDBX_geo mt_geo; - /* next unallocated page */ -#define mt_next_pgno mt_geo.next - /* corresponding to the current size of datafile */ -#define mt_end_pgno mt_geo.now + uint32_t mt_signature; /* Transaction Flags */ /* mdbx_txn_begin() flags */ @@ -2749,8 +2735,17 @@ struct MDBX_txn { MDBX_SHRINK_ALLOWED) #error "Oops, some flags overlapped or wrong" #endif + uint32_t mt_flags; + + MDBX_txn *mt_parent; /* parent of a nested txn */ + /* Nested txn under this txn, set together with flag MDBX_TXN_HAS_CHILD */ + MDBX_txn *mt_child; + MDBX_geo mt_geo; + /* next unallocated page */ +#define mt_next_pgno mt_geo.next + /* corresponding to the current size of datafile */ +#define mt_end_pgno mt_geo.now - unsigned mt_flags; /* The ID of this transaction. IDs are integers incrementing from 1. * Only committed write transactions increment the ID. If a transaction * aborts, the ID may be re-used by the next writer. 
*/ @@ -3170,7 +3165,7 @@ void mdbx_assert_fail(const MDBX_env *env, const char *msg, const char *func, #define mdbx_flush_incoherent_cpu_writeback() mdbx_compiler_barrier() #endif /* MDBX_CPU_WRITEBACK_INCOHERENT */ -static __maybe_unused __inline void +MDBX_MAYBE_UNUSED static __inline void mdbx_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { #if MDBX_MMAP_INCOHERENT_FILE_WRITE char *const begin = (char *)(-pagesize & (intptr_t)addr); @@ -3212,7 +3207,7 @@ MDBX_INTERNAL_FUNC void mdbx_rthc_global_init(void); MDBX_INTERNAL_FUNC void mdbx_rthc_global_dtor(void); MDBX_INTERNAL_FUNC void mdbx_rthc_thread_dtor(void *ptr); -static __maybe_unused __inline void mdbx_jitter4testing(bool tiny) { +MDBX_MAYBE_UNUSED static __inline void mdbx_jitter4testing(bool tiny) { #if MDBX_DEBUG if (MDBX_DBG_JITTER & mdbx_runtime_flags) mdbx_osal_jitter(tiny); @@ -3365,35 +3360,35 @@ typedef struct MDBX_node { /* Do not spill pages to disk if txn is getting full, may fail instead */ #define MDBX_NOSPILL 0x8000 -MDBX_NOTHROW_CONST_FUNCTION static __maybe_unused __inline pgno_t +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t pgno_add(pgno_t base, pgno_t augend) { assert(base <= MAX_PAGENO); return (augend < MAX_PAGENO - base) ? base + augend : MAX_PAGENO; } -MDBX_NOTHROW_CONST_FUNCTION static __maybe_unused __inline pgno_t +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t pgno_sub(pgno_t base, pgno_t subtrahend) { assert(base >= MIN_PAGENO); return (subtrahend < base - MIN_PAGENO) ? base - subtrahend : MIN_PAGENO; } -MDBX_NOTHROW_CONST_FUNCTION static __always_inline __maybe_unused bool +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline bool is_powerof2(size_t x) { return (x & (x - 1)) == 0; } -MDBX_NOTHROW_CONST_FUNCTION static __always_inline __maybe_unused size_t +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline size_t floor_powerof2(size_t value, size_t granularity) { assert(is_powerof2(granularity)); return value & ~(granularity - 1); } -MDBX_NOTHROW_CONST_FUNCTION static __always_inline __maybe_unused size_t +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline size_t ceil_powerof2(size_t value, size_t granularity) { return floor_powerof2(value + granularity - 1, granularity); } -MDBX_NOTHROW_CONST_FUNCTION static __maybe_unused unsigned +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static unsigned log2n_powerof2(size_t value) { assert(value > 0 && value < INT32_MAX && is_powerof2(value)); assert((value & -(int32_t)value) == value); @@ -3423,7 +3418,7 @@ log2n_powerof2(size_t value) { #define ENV_USABLE_FLAGS (ENV_CHANGEABLE_FLAGS | ENV_CHANGELESS_FLAGS) #if !defined(__cplusplus) || CONSTEXPR_ENUM_FLAGS_OPERATIONS -static __maybe_unused void static_checks(void) { +MDBX_MAYBE_UNUSED static void static_checks(void) { STATIC_ASSERT_MSG(INT16_MAX - CORE_DBS == MDBX_MAX_DBI, "Oops, MDBX_MAX_DBI or CORE_DBS?"); STATIC_ASSERT_MSG((unsigned)(MDBX_DB_ACCEDE | MDBX_CREATE) == @@ -3987,7 +3982,7 @@ static void usage(void) { " -a\t\tappend records in input order (required for custom " "comparators)\n" " -f file\tread from file instead of stdin\n" - " -s name\tload into named subDB\n" + " -s name\tload into specified named subDB\n" " -N\t\tdon't overwrite existing records when loading, just skip " "ones\n" " -p\t\tpurge subDB before loading\n" diff --git a/mdbx/dist/mdbx_stat.c b/mdbx/dist/mdbx_stat.c index a077526..2ab1136 100644 --- a/mdbx/dist/mdbx_stat.c +++ b/mdbx/dist/mdbx_stat.c @@ -34,7 
+34,7 @@ * top-level directory of the distribution or, alternatively, at * . */ -#define MDBX_BUILD_SOURCERY 10aa116f5f6a1fca4ccea1310d3d331a39161abc5b63b6a30e01812eab671e7c_v0_10_0_0_gaa1f6fbd +#define MDBX_BUILD_SOURCERY 220eee3b3a4b48cb20897d772c6665de65226e5811d687b6f7f43b2beeb9cb31_v0_10_1_15_g63e7276c #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -228,14 +228,6 @@ # endif #endif /* __must_check_result */ -#ifndef __maybe_unused -# if defined(__GNUC__) || __has_attribute(__unused__) -# define __maybe_unused __attribute__((__unused__)) -# else -# define __maybe_unused -# endif -#endif /* __maybe_unused */ - #if !defined(__noop) && !defined(_MSC_VER) # define __noop(...) do {} while(0) #endif /* __noop */ @@ -381,7 +373,7 @@ #endif /* __anonymous_struct_extension__ */ #ifndef __Wpedantic_format_voidptr - static __inline __maybe_unused const void* MDBX_PURE_FUNCTION + MDBX_MAYBE_UNUSED MDBX_PURE_FUNCTION static __inline const void* __Wpedantic_format_voidptr(const void* ptr) {return ptr;} # define __Wpedantic_format_voidptr(ARG) __Wpedantic_format_voidptr(ARG) #endif /* __Wpedantic_format_voidptr */ @@ -993,7 +985,7 @@ typedef pthread_mutex_t mdbx_fastmutex_t; /* Get the size of a memory page for the system. * This is the basic size that the platform's memory manager uses, and is * fundamental to the use of memory-mapped files. */ -MDBX_NOTHROW_CONST_FUNCTION static __maybe_unused __inline size_t +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline size_t mdbx_syspagesize(void) { #if defined(_WIN32) || defined(_WIN64) SYSTEM_INFO si; @@ -1080,7 +1072,7 @@ extern void mdbx_osal_jitter(bool tiny); #include #endif -static __maybe_unused __inline void mdbx_compiler_barrier(void) { +MDBX_MAYBE_UNUSED static __inline void mdbx_compiler_barrier(void) { #if defined(__clang__) || defined(__GNUC__) __asm__ __volatile__("" ::: "memory"); #elif defined(_MSC_VER) @@ -1100,7 +1092,7 @@ static __maybe_unused __inline void mdbx_compiler_barrier(void) { #endif } -static __maybe_unused __inline void mdbx_memory_barrier(void) { +MDBX_MAYBE_UNUSED static __inline void mdbx_memory_barrier(void) { #ifdef MDBX_HAVE_C11ATOMICS atomic_thread_fence(memory_order_seq_cst); #elif defined(__ATOMIC_SEQ_CST) @@ -1140,8 +1132,8 @@ static __maybe_unused __inline void mdbx_memory_barrier(void) { #define mdbx_asprintf asprintf #define mdbx_vasprintf vasprintf #else -MDBX_INTERNAL_FUNC MDBX_PRINTF_ARGS(2, 3) int __maybe_unused - mdbx_asprintf(char **strp, const char *fmt, ...); +MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC + MDBX_PRINTF_ARGS(2, 3) int mdbx_asprintf(char **strp, const char *fmt, ...); MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, va_list ap); #endif @@ -1164,7 +1156,7 @@ MDBX_INTERNAL_VAR bool mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */; LIBMDBX_API char *mdbx_strdup(const char *str); #endif -static __maybe_unused __inline int mdbx_get_errno(void) { +MDBX_MAYBE_UNUSED static __inline int mdbx_get_errno(void) { #if defined(_WIN32) || defined(_WIN64) DWORD rc = GetLastError(); #else @@ -1248,8 +1240,10 @@ MDBX_INTERNAL_FUNC int mdbx_mmap(const int flags, mdbx_mmap_t *map, const size_t must, const size_t limit, const unsigned options); MDBX_INTERNAL_FUNC int mdbx_munmap(mdbx_mmap_t *map); -MDBX_INTERNAL_FUNC int mdbx_mresize(int flags, mdbx_mmap_t *map, size_t current, - size_t wanna, const bool may_move); +#define MDBX_MRESIZE_MAY_MOVE 0x00000100 +#define MDBX_MRESIZE_MAY_UNMAP 0x00000200 +MDBX_INTERNAL_FUNC int mdbx_mresize(const int flags, mdbx_mmap_t *map, + size_t 
size, size_t limit); #if defined(_WIN32) || defined(_WIN64) typedef struct { unsigned limit, count; @@ -1266,7 +1260,7 @@ MDBX_INTERNAL_FUNC int mdbx_msync(mdbx_mmap_t *map, size_t offset, MDBX_INTERNAL_FUNC int mdbx_check_fs_rdonly(mdbx_filehandle_t handle, const char *pathname, int err); -static __maybe_unused __inline uint32_t mdbx_getpid(void) { +MDBX_MAYBE_UNUSED static __inline uint32_t mdbx_getpid(void) { STATIC_ASSERT(sizeof(mdbx_pid_t) <= sizeof(uint32_t)); #if defined(_WIN32) || defined(_WIN64) return GetCurrentProcessId(); @@ -1275,7 +1269,7 @@ static __maybe_unused __inline uint32_t mdbx_getpid(void) { #endif } -static __maybe_unused __inline uintptr_t mdbx_thread_self(void) { +MDBX_MAYBE_UNUSED static __inline uintptr_t mdbx_thread_self(void) { mdbx_tid_t thunk; STATIC_ASSERT(sizeof(uintptr_t) >= sizeof(thunk)); #if defined(_WIN32) || defined(_WIN64) @@ -1286,7 +1280,7 @@ static __maybe_unused __inline uintptr_t mdbx_thread_self(void) { return (uintptr_t)thunk; } -MDBX_INTERNAL_FUNC void __maybe_unused mdbx_osal_jitter(bool tiny); +MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC void mdbx_osal_jitter(bool tiny); MDBX_INTERNAL_FUNC uint64_t mdbx_osal_monotime(void); MDBX_INTERNAL_FUNC uint64_t mdbx_osal_16dot16_to_monotime(uint32_t seconds_16dot16); @@ -2057,7 +2051,7 @@ static __always_inline memory_order mo_c11_load(enum MDBX_memory_order fence) { static __inline void mdbx_jitter4testing(bool tiny); -static __maybe_unused __always_inline void +MDBX_MAYBE_UNUSED static __always_inline void mdbx_memory_fence(enum MDBX_memory_order order, bool write) { #ifdef MDBX_HAVE_C11ATOMICS atomic_thread_fence(write ? mo_c11_store(order) : mo_c11_load(order)); @@ -2069,7 +2063,7 @@ mdbx_memory_fence(enum MDBX_memory_order order, bool write) { #endif /* MDBX_HAVE_C11ATOMICS */ } -static __maybe_unused __always_inline uint32_t +MDBX_MAYBE_UNUSED static __always_inline uint32_t atomic_store32(MDBX_atomic_uint32_t *p, const uint32_t value, enum MDBX_memory_order order) { STATIC_ASSERT(sizeof(MDBX_atomic_uint32_t) == 4); @@ -2085,7 +2079,7 @@ atomic_store32(MDBX_atomic_uint32_t *p, const uint32_t value, return value; } -static __maybe_unused __always_inline uint32_t +MDBX_MAYBE_UNUSED static __always_inline uint32_t atomic_load32(const MDBX_atomic_uint32_t *p, enum MDBX_memory_order order) { STATIC_ASSERT(sizeof(MDBX_atomic_uint32_t) == 4); #ifdef MDBX_HAVE_C11ATOMICS @@ -2100,7 +2094,7 @@ atomic_load32(const MDBX_atomic_uint32_t *p, enum MDBX_memory_order order) { #endif /* MDBX_HAVE_C11ATOMICS */ } -static __maybe_unused __always_inline uint64_t +MDBX_MAYBE_UNUSED static __always_inline uint64_t atomic_store64(MDBX_atomic_uint64_t *p, const uint64_t value, enum MDBX_memory_order order) { STATIC_ASSERT(sizeof(MDBX_atomic_uint64_t) == 8); @@ -2124,7 +2118,7 @@ atomic_store64(MDBX_atomic_uint64_t *p, const uint64_t value, return value; } -static __maybe_unused +MDBX_MAYBE_UNUSED static #if MDBX_64BIT_ATOMIC __always_inline #endif /* MDBX_64BIT_ATOMIC */ @@ -2712,15 +2706,7 @@ typedef struct MDBX_dbx { * Every operation requires a transaction handle. 
*/ struct MDBX_txn { #define MDBX_MT_SIGNATURE UINT32_C(0x93D53A31) - size_t mt_signature; - MDBX_txn *mt_parent; /* parent of a nested txn */ - /* Nested txn under this txn, set together with flag MDBX_TXN_HAS_CHILD */ - MDBX_txn *mt_child; - MDBX_geo mt_geo; - /* next unallocated page */ -#define mt_next_pgno mt_geo.next - /* corresponding to the current size of datafile */ -#define mt_end_pgno mt_geo.now + uint32_t mt_signature; /* Transaction Flags */ /* mdbx_txn_begin() flags */ @@ -2749,8 +2735,17 @@ struct MDBX_txn { MDBX_SHRINK_ALLOWED) #error "Oops, some flags overlapped or wrong" #endif + uint32_t mt_flags; + + MDBX_txn *mt_parent; /* parent of a nested txn */ + /* Nested txn under this txn, set together with flag MDBX_TXN_HAS_CHILD */ + MDBX_txn *mt_child; + MDBX_geo mt_geo; + /* next unallocated page */ +#define mt_next_pgno mt_geo.next + /* corresponding to the current size of datafile */ +#define mt_end_pgno mt_geo.now - unsigned mt_flags; /* The ID of this transaction. IDs are integers incrementing from 1. * Only committed write transactions increment the ID. If a transaction * aborts, the ID may be re-used by the next writer. */ @@ -3170,7 +3165,7 @@ void mdbx_assert_fail(const MDBX_env *env, const char *msg, const char *func, #define mdbx_flush_incoherent_cpu_writeback() mdbx_compiler_barrier() #endif /* MDBX_CPU_WRITEBACK_INCOHERENT */ -static __maybe_unused __inline void +MDBX_MAYBE_UNUSED static __inline void mdbx_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { #if MDBX_MMAP_INCOHERENT_FILE_WRITE char *const begin = (char *)(-pagesize & (intptr_t)addr); @@ -3212,7 +3207,7 @@ MDBX_INTERNAL_FUNC void mdbx_rthc_global_init(void); MDBX_INTERNAL_FUNC void mdbx_rthc_global_dtor(void); MDBX_INTERNAL_FUNC void mdbx_rthc_thread_dtor(void *ptr); -static __maybe_unused __inline void mdbx_jitter4testing(bool tiny) { +MDBX_MAYBE_UNUSED static __inline void mdbx_jitter4testing(bool tiny) { #if MDBX_DEBUG if (MDBX_DBG_JITTER & mdbx_runtime_flags) mdbx_osal_jitter(tiny); @@ -3365,35 +3360,35 @@ typedef struct MDBX_node { /* Do not spill pages to disk if txn is getting full, may fail instead */ #define MDBX_NOSPILL 0x8000 -MDBX_NOTHROW_CONST_FUNCTION static __maybe_unused __inline pgno_t +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t pgno_add(pgno_t base, pgno_t augend) { assert(base <= MAX_PAGENO); return (augend < MAX_PAGENO - base) ? base + augend : MAX_PAGENO; } -MDBX_NOTHROW_CONST_FUNCTION static __maybe_unused __inline pgno_t +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t pgno_sub(pgno_t base, pgno_t subtrahend) { assert(base >= MIN_PAGENO); return (subtrahend < base - MIN_PAGENO) ? 
base - subtrahend : MIN_PAGENO; } -MDBX_NOTHROW_CONST_FUNCTION static __always_inline __maybe_unused bool +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline bool is_powerof2(size_t x) { return (x & (x - 1)) == 0; } -MDBX_NOTHROW_CONST_FUNCTION static __always_inline __maybe_unused size_t +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline size_t floor_powerof2(size_t value, size_t granularity) { assert(is_powerof2(granularity)); return value & ~(granularity - 1); } -MDBX_NOTHROW_CONST_FUNCTION static __always_inline __maybe_unused size_t +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline size_t ceil_powerof2(size_t value, size_t granularity) { return floor_powerof2(value + granularity - 1, granularity); } -MDBX_NOTHROW_CONST_FUNCTION static __maybe_unused unsigned +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static unsigned log2n_powerof2(size_t value) { assert(value > 0 && value < INT32_MAX && is_powerof2(value)); assert((value & -(int32_t)value) == value); @@ -3423,7 +3418,7 @@ log2n_powerof2(size_t value) { #define ENV_USABLE_FLAGS (ENV_CHANGEABLE_FLAGS | ENV_CHANGELESS_FLAGS) #if !defined(__cplusplus) || CONSTEXPR_ENUM_FLAGS_OPERATIONS -static __maybe_unused void static_checks(void) { +MDBX_MAYBE_UNUSED static void static_checks(void) { STATIC_ASSERT_MSG(INT16_MAX - CORE_DBS == MDBX_MAX_DBI, "Oops, MDBX_MAX_DBI or CORE_DBS?"); STATIC_ASSERT_MSG((unsigned)(MDBX_DB_ACCEDE | MDBX_CREATE) == @@ -3556,14 +3551,16 @@ static void print_stat(MDBX_stat *ms) { static void usage(const char *prog) { fprintf(stderr, - "usage: %s [-V] [-e] [-f[f[f]]] [-r[r]] [-a|-s name] dbpath\n" + "usage: %s [-V] [-q] [-e] [-f[f[f]]] [-r[r]] [-a|-s name] dbpath\n" " -V\t\tprint version and exit\n" + " -q\t\tbe quiet\n" + " -p\t\tshow statistics of page operations for current session\n" " -e\t\tshow whole DB info\n" " -f\t\tshow GC info\n" " -r\t\tshow readers\n" " -a\t\tprint stat of main DB and all subDBs\n" - " \t\t(default) print stat of only the main DB\n" - " -s name\tprint stat of only the named subDB\n", + " -s name\tprint stat of only the specified named subDB\n" + " \t\tby default print stat of only the main DB\n", prog); exit(EXIT_FAILURE); } @@ -3590,12 +3587,15 @@ static int reader_list_func(void *ctx, int num, int slot, mdbx_pid_t pid, } const char *prog; +bool quiet = false; static void error(const char *func, int rc) { - fprintf(stderr, "%s: %s() error %d %s\n", prog, func, rc, mdbx_strerror(rc)); + if (!quiet) + fprintf(stderr, "%s: %s() error %d %s\n", prog, func, rc, + mdbx_strerror(rc)); } int main(int argc, char *argv[]) { - int o, rc; + int opt, rc; MDBX_env *env; MDBX_txn *txn; MDBX_dbi dbi; @@ -3603,20 +3603,23 @@ int main(int argc, char *argv[]) { prog = argv[0]; char *envname; char *subname = nullptr; - int alldbs = 0, envinfo = 0, envflags = 0, freinfo = 0, rdrinfo = 0; + bool alldbs = false, envinfo = false, pgop = false; + int freinfo = 0, rdrinfo = 0; if (argc < 2) usage(prog); - while ((o = getopt(argc, argv, - "V" - "a" - "e" - "f" - "n" - "r" - "s:")) != EOF) { - switch (o) { + while ((opt = getopt(argc, argv, + "V" + "q" + "p" + "a" + "e" + "f" + "n" + "r" + "s:")) != EOF) { + switch (opt) { case 'V': printf("mdbx_stat version %d.%d.%d.%d\n" " - source: %s %s, commit %s, tree %s\n" @@ -3631,22 +3634,27 @@ int main(int argc, char *argv[]) { mdbx_build.target, mdbx_build.compiler, mdbx_build.flags, mdbx_build.options); return EXIT_SUCCESS; + case 'q': + quiet = true; + break; + case 'p': + pgop = true; + break; case 'a': if 
(subname) usage(prog); - alldbs++; + alldbs = true; break; case 'e': - envinfo++; + envinfo = true; break; case 'f': - freinfo++; + freinfo += 1; break; case 'n': - envflags |= MDBX_NOSUBDIR; break; case 'r': - rdrinfo++; + rdrinfo += 1; break; case 's': if (alldbs) @@ -3676,10 +3684,12 @@ int main(int argc, char *argv[]) { envname = argv[optind]; envname = argv[optind]; - printf("mdbx_stat %s (%s, T-%s)\nRunning for %s...\n", - mdbx_version.git.describe, mdbx_version.git.datetime, - mdbx_version.git.tree, envname); - fflush(nullptr); + if (!quiet) { + printf("mdbx_stat %s (%s, T-%s)\nRunning for %s...\n", + mdbx_version.git.describe, mdbx_version.git.datetime, + mdbx_version.git.tree, envname); + fflush(nullptr); + } rc = mdbx_env_create(&env); if (unlikely(rc != MDBX_SUCCESS)) { @@ -3695,7 +3705,7 @@ int main(int argc, char *argv[]) { } } - rc = mdbx_env_open(env, envname, envflags | MDBX_RDONLY, 0); + rc = mdbx_env_open(env, envname, MDBX_RDONLY, 0); if (unlikely(rc != MDBX_SUCCESS)) { error("mdbx_env_open", rc); goto env_close; @@ -3707,7 +3717,7 @@ int main(int argc, char *argv[]) { goto txn_abort; } - if (envinfo || freinfo) { + if (envinfo || freinfo || pgop) { rc = mdbx_env_info_ex(env, txn, &mei, sizeof(mei)); if (unlikely(rc != MDBX_SUCCESS)) { error("mdbx_env_info_ex", rc); @@ -3718,6 +3728,33 @@ int main(int argc, char *argv[]) { memset(&mei, 0, sizeof(mei)); } + if (pgop) { + printf("Page Operations (for current session):\n"); + printf(" New: %8" PRIu64 "\t// quantity of new pages added\n", + mei.mi_pgop_stat.newly); + printf(" CoW: %8" PRIu64 + "\t// quantity of pages copied for altering\n", + mei.mi_pgop_stat.cow); + printf(" Clone: %8" PRIu64 "\t// quantity of parent's dirty pages " + "cloned for nested transactions\n", + mei.mi_pgop_stat.clone); + printf(" Split: %8" PRIu64 + "\t// page splits during insertions or updates\n", + mei.mi_pgop_stat.split); + printf(" Merge: %8" PRIu64 + "\t// page merges during deletions or updates\n", + mei.mi_pgop_stat.merge); + printf(" Spill: %8" PRIu64 "\t// quantity of spilled/ousted `dirty` " + "pages during large transactions\n", + mei.mi_pgop_stat.spill); + printf(" Unspill: %8" PRIu64 "\t// quantity of unspilled/redone `dirty` " + "pages during large transactions\n", + mei.mi_pgop_stat.unspill); + printf(" WOP: %8" PRIu64 + "\t// number of explicit write operations (not pages) to disk\n", + mei.mi_pgop_stat.wops); + } + if (envinfo) { printf("Environment Info\n"); printf(" Pagesize: %u\n", mei.mi_dxb_pagesize); @@ -3858,7 +3895,8 @@ int main(int argc, char *argv[]) { case MDBX_NOTFOUND: break; case MDBX_EINTR: - fprintf(stderr, "Interrupted by signal/user\n"); + if (!quiet) + fprintf(stderr, "Interrupted by signal/user\n"); goto txn_abort; default: error("mdbx_cursor_get", rc); @@ -3964,7 +4002,8 @@ int main(int argc, char *argv[]) { case MDBX_NOTFOUND: break; case MDBX_EINTR: - fprintf(stderr, "Interrupted by signal/user\n"); + if (!quiet) + fprintf(stderr, "Interrupted by signal/user\n"); break; default: if (unlikely(rc != MDBX_SUCCESS)) diff --git a/mdbx/env.go b/mdbx/env.go index b6b0c60..901aaf4 100644 --- a/mdbx/env.go +++ b/mdbx/env.go @@ -14,10 +14,13 @@ import ( "unsafe" ) -// success is a value returned from the LMDB API to indicate a successful call. +// success is a value returned from the MDBX API to indicate a successful call. // The functions in this API check for this value internally, so its use is not required.
const success = C.MDBX_SUCCESS +const Major = C.MDBX_VERSION_MAJOR +const Minor = C.MDBX_VERSION_MINOR + const ( // Flags for Env.Open. // @@ -36,10 +39,10 @@ const ( SafeNoSync = C.MDBX_SAFE_NOSYNC Durable = C.MDBX_SYNC_DURABLE NoTLS = C.MDBX_NOTLS // Danger zone. When unset, reader locktable slots are tied to their thread. - //NoLock = C.MDBX_NOLOCK // Danger zone. LMDB does not use any locks. + //NoLock = C.MDBX_NOLOCK // Danger zone. MDBX does not use any locks. NoReadahead = C.MDBX_NORDAHEAD // Disable readahead. Requires OS support. - NoMemInit = C.MDBX_NOMEMINIT // Disable LMDB memory initialization. - Exclusive = C.MDBX_EXCLUSIVE // Disable LMDB memory initialization. + NoMemInit = C.MDBX_NOMEMINIT // Disable MDBX memory initialization. + Exclusive = C.MDBX_EXCLUSIVE // Open the environment in exclusive/monopolistic mode. ) const ( @@ -62,6 +65,8 @@ const ( AllowTxOverlap = C.MDBX_DBG_LEGACY_OVERLAP ) +type LogLvl = C.MDBX_log_level_t + const ( LogLvlFatal = C.MDBX_LOG_FATAL LogLvlError = C.MDBX_LOG_ERROR @@ -97,6 +102,7 @@ const ( OptSpillMaxDenominator = C.MDBX_opt_spill_max_denominator OptSpillMinDenominator = C.MDBX_opt_spill_min_denominator OptSpillParent4ChildDenominator = C.MDBX_opt_spill_parent4child_denominator + OptMergeThreshold16dot16Percent = C.MDBX_opt_merge_threshold_16dot16_percent ) var ( @@ -114,13 +120,12 @@ type DBI C.MDBX_dbi // See MDBX_env. type Env struct { _env *C.MDBX_env + ckey *C.MDBX_val + cval *C.MDBX_val // closeLock is used to allow the Txn finalizer to check if the Env has // been closed, so that it may know if it must abort. closeLock sync.RWMutex - - ckey *C.MDBX_val - cval *C.MDBX_val } // NewEnv allocates and initializes a new Env. @@ -134,8 +139,6 @@ func NewEnv() (*Env, error) { } env.ckey = (*C.MDBX_val)(C.malloc(C.size_t(unsafe.Sizeof(C.MDBX_val{})))) env.cval = (*C.MDBX_val)(C.malloc(C.size_t(unsafe.Sizeof(C.MDBX_val{})))) - - runtime.SetFinalizer(env, (*Env).Close) return env, nil } @@ -151,7 +154,6 @@ func (env *Env) Open(path string, flags uint, mode os.FileMode) error { } var errNotOpen = errors.New("environment is not open") -var errNegSize = errors.New("negative size") // FD returns the open file descriptor (or Windows file handle) for the given // environment. An error is returned if the environment has not been @@ -159,7 +161,7 @@ var errNegSize = errors.New("negative size") // // See mdbx_env_get_fd. func (env *Env) FD() (uintptr, error) { - // fdInvalid is the value -1 as a uintptr, which is used by LMDB in the + // fdInvalid is the value -1 as a uintptr, which is used by MDBX in the // case that env has not been opened yet. The strange construction is done // to avoid constant value overflow errors at compile time. const fdInvalid = ^uintptr(0) @@ -213,9 +215,13 @@ func (env *Env) ReaderCheck() (int, error) { return int(_dead), operrno("mdbx_reader_check", ret) } -func (env *Env) close() bool { +// Close shuts down the environment and releases the memory map. Close must +// be called explicitly, since NewEnv no longer registers a finalizer. +// +// See mdbx_env_close. +func (env *Env) Close() { if env._env == nil { - return false + return } env.closeLock.Lock() @@ -227,19 +233,6 @@ func (env *Env) close() bool { C.free(unsafe.Pointer(env.cval)) env.ckey = nil env.cval = nil - return true -} - -// Close shuts down the environment, releases the memory map, and clears the -// finalizer on env. -// -// See mdbx_env_close.
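Since NewEnv no longer registers a finalizer and the new Close returns nothing, deterministic cleanup is now the caller's responsibility. A minimal sketch of the resulting pattern, using only identifiers from this diff (the helper name is hypothetical):

```go
package main

import "github.com/torquem-ch/mdbx-go/mdbx"

// withEnv is a hypothetical helper illustrating explicit cleanup.
func withEnv(work func(*mdbx.Env) error) error {
	env, err := mdbx.NewEnv()
	if err != nil {
		return err
	}
	// No finalizer is registered anymore, so without an explicit Close
	// the C-allocated ckey/cval buffers and the environment would leak.
	defer env.Close()
	return work(env)
}
```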
-func (env *Env) Close() error { - if env.close() { - runtime.SetFinalizer(env, nil) - return nil - } - return errors.New("environment is already closed") } // CopyFD copies env to the file descriptor fd. @@ -376,16 +369,16 @@ func (env *Env) Info() (*EnvInfo, error) { Shrink: uint64(_info.mi_geo.shrink), Grow: uint64(_info.mi_geo.grow), }, - //PageOps: EnfInfoPageOps{ - // Newly: uint64(_info.mi_pgop_stat.newly), - // Cow: uint64(_info.mi_pgop_stat.cow), - // Clone: uint64(_info.mi_pgop_stat.clone), - // Split: uint64(_info.mi_pgop_stat.split), - // Merge: uint64(_info.mi_pgop_stat.merge), - // Spill: uint64(_info.mi_pgop_stat.spill), - // Unspill: uint64(_info.mi_pgop_stat.unspill), - // Wops: uint64(_info.mi_pgop_stat.wops), - //}, + PageOps: EnfInfoPageOps{ + Newly: uint64(_info.mi_pgop_stat.newly), + Cow: uint64(_info.mi_pgop_stat.cow), + Clone: uint64(_info.mi_pgop_stat.clone), + Split: uint64(_info.mi_pgop_stat.split), + Merge: uint64(_info.mi_pgop_stat.merge), + Spill: uint64(_info.mi_pgop_stat.spill), + Unspill: uint64(_info.mi_pgop_stat.unspill), + Wops: uint64(_info.mi_pgop_stat.wops), + }, LastPNO: int64(_info.mi_last_pgno), LastTxnID: int64(_info.mi_recent_txnid), MaxReaders: uint(_info.mi_maxreaders), @@ -439,9 +432,9 @@ func (env *Env) Flags() (uint, error) { return uint(_flags), nil } -func (env *Env) SetDebug(logLvl int, dbg int, logger *C.MDBX_debug_func) error { - ret := C.mdbx_setup_debug(C.MDBX_log_level_t(logLvl), C.MDBX_debug_flags_t(dbg), logger) - return operrno("mdbx_setup_debug", ret) +func (env *Env) SetDebug(logLvl LogLvl, dbg int, logger *C.MDBX_debug_func) error { + _ = C.mdbx_setup_debug(logLvl, C.MDBX_debug_flags_t(dbg), logger) + return nil } // Path returns the path argument passed to Open. Path returns a non-nil error @@ -465,6 +458,15 @@ func (env *Env) SetOption(option uint, value uint64) error { return operrno("mdbx_env_set_option", ret) } +// GetOption returns the current value of a runtime option. +// +// See mdbx_env_get_option. +func (env *Env) GetOption(option uint) (uint64, error) { + var res C.uint64_t + ret := C.mdbx_env_get_option(env._env, C.MDBX_option_t(option), &res) + return uint64(res), operrno("mdbx_env_get_option", ret) +} + func (env *Env) SetGeometry(sizeLower int, sizeNow int, sizeUpper int, growthStep int, shrinkThreshold int, pageSize int) error { ret := C.mdbx_env_set_geometry(env._env, C.intptr_t(sizeLower), @@ -476,26 +475,6 @@ func (env *Env) SetGeometry(sizeLower int, sizeNow int, sizeUpper int, growthSte return operrno("mdbx_env_set_geometry", ret) } -// SetMaxReaders sets the maximum number of reader slots in the environment. -// -// See mdbx_env_set_maxreaders. -func (env *Env) SetMaxReaders(size int) error { - if size < 0 { - return errNegSize - } - ret := C.mdbx_env_set_maxreaders(env._env, C.uint(size)) - return operrno("mdbx_env_set_maxreaders", ret) } - -// MaxReaders returns the maximum number of reader slots for the environment. -// -// See mdbx_env_get_maxreaders. -func (env *Env) MaxReaders() (int, error) { - var max C.uint - ret := C.mdbx_env_get_maxreaders(env._env, &max) - return int(max), operrno("mdbx_env_get_maxreaders", ret) -} - // MaxKeySize returns the maximum allowed length for a key. // // See mdbx_env_get_maxkeysize. @@ -506,17 +485,6 @@ func (env *Env) MaxKeySize() int { return int(C.mdbx_env_get_maxkeysize_ex(env._env, 0)) } -// SetMaxDBs sets the maximum number of named databases for the environment. -// -// See mdbx_env_set_maxdbs.
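With SetMaxReaders/MaxReaders removed here (and SetMaxDBs removed just below), limits are configured through the generic SetOption/GetOption pair added above. A hedged sketch of the new pattern, using only identifiers that appear in this diff (the path is a placeholder):

```go
package main

import (
	"fmt"
	"log"

	"github.com/torquem-ch/mdbx-go/mdbx"
)

func main() {
	env, err := mdbx.NewEnv()
	if err != nil {
		log.Fatal(err)
	}
	defer env.Close()

	// OptMaxDB and OptMaxReaders replace the removed SetMaxDBs and
	// SetMaxReaders helpers; options must be set before Open.
	if err := env.SetOption(mdbx.OptMaxDB, 1024); err != nil {
		log.Fatal(err)
	}
	if err := env.SetOption(mdbx.OptMaxReaders, 256); err != nil {
		log.Fatal(err)
	}

	// Geometry: size-lower, size-now, size-upper, growth step,
	// shrink threshold, page size; -1 keeps a parameter at its default.
	if err := env.SetGeometry(-1, -1, 1<<30, -1, -1, 4096); err != nil {
		log.Fatal(err)
	}

	if err := env.Open("/tmp/mdbx-example", 0, 0664); err != nil {
		log.Fatal(err)
	}

	maxReaders, err := env.GetOption(mdbx.OptMaxReaders)
	if err != nil {
		log.Fatal(err)
	}

	// Info now also exposes the page-operation counters (PageOps).
	info, err := env.Info()
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(maxReaders, info.PageOps.Cow)
}
```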
-func (env *Env) SetMaxDBs(size int) error { - if size < 0 { - return errNegSize - } - ret := C.mdbx_env_set_maxdbs(env._env, C.MDBX_dbi(size)) - return operrno("mdbx_env_set_maxdbs", ret) -} - // BeginTxn is an unsafe, low-level method to initialize a new transaction on // env. The Txn returned by BeginTxn is unmanaged and must be terminated by // calling either its Abort or Commit methods to ensure that its resources are diff --git a/mdbx/env_test.go b/mdbx/env_test.go index 4dcf2e0..3b7eb83 100644 --- a/mdbx/env_test.go +++ b/mdbx/env_test.go @@ -2,10 +2,8 @@ package mdbx import ( "fmt" - "io/ioutil" - "os" + "runtime" "strings" - "syscall" "testing" ) @@ -33,12 +31,7 @@ func TestEnv_Path(t *testing.T) { } // open an environment - dir, err := ioutil.TempDir("", "mdb_test") - if err != nil { - t.Fatalf("tempdir: %v", err) - } - defer os.RemoveAll(dir) - + dir := t.TempDir() err = env.Open(dir, 0, 0644) defer env.Close() if err != nil { @@ -73,35 +66,26 @@ func TestEnv_Open(t *testing.T) { t.Error(err1) return } - defer func() { - if err := env.Close(); err != nil { - t.Error(err) - } - }() + defer env.Close() // open an environment at a temporary path. - path, err := ioutil.TempDir("", "mdb_test") - if err != nil { - t.Fatalf("tempdir: %v", err) - } - defer os.RemoveAll(path) - err = env.Open(path, 0, 0664) + path := t.TempDir() + err := env.Open(path, 0, 0664) if err != nil { t.Errorf("open: %s", err) } } func TestEnv_FD(t *testing.T) { + if runtime.GOOS == "windows" { + t.Skip("FD funcs not supported on windows") + } env, err1 := NewEnv() if err1 != nil { t.Error(err1) return } - defer func() { - if err := env.Close(); err != nil { - t.Error(err) - } - }() + defer env.Close() fd, err := env.FD() if err != nil && !strings.Contains(err.Error(), "operation not permitted") { @@ -109,11 +93,7 @@ func TestEnv_FD(t *testing.T) { } // open an environment at a temporary path. 
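The test updates below drop the ioutil.TempDir/os.RemoveAll boilerplate in favor of t.TempDir, which creates the directory and registers its removal automatically. A minimal sketch of the pattern in the style of this file's tests (the test name is hypothetical):

```go
package mdbx

import "testing"

func TestEnv_OpenTempDir(t *testing.T) {
	env, err := NewEnv()
	if err != nil {
		t.Fatal(err)
	}
	defer env.Close()

	// t.TempDir returns a fresh directory that the testing package
	// removes automatically when the test completes.
	dir := t.TempDir()
	if err := env.Open(dir, 0, 0664); err != nil {
		t.Fatalf("open: %s", err)
	}
}
```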
- path, err := ioutil.TempDir("", "mdb_test") - if err != nil { - t.Fatalf("tempdir: %v", err) - } - defer os.RemoveAll(path) + path := t.TempDir() err = env.Open(path, 0, 0664) if err != nil { t.Errorf("open: %s", err) @@ -130,7 +110,6 @@ func TestEnv_FD(t *testing.T) { func TestEnv_Flags(t *testing.T) { env := setup(t) - defer clean(env, t) flags, err := env.Flags() if err != nil { @@ -173,25 +152,21 @@ func TestEnv_Flags(t *testing.T) { } func TestEnv_SetMaxReader(t *testing.T) { - dir, err := ioutil.TempDir("", "test-env-setmaxreaders-") - if err != nil { - t.Fatal(err) - } - defer os.RemoveAll(dir) + dir := t.TempDir() env, err := NewEnv() if err != nil { t.Error(err) } - maxreaders := 246 - err = env.SetMaxReaders(maxreaders) + maxreaders := uint64(246) + err = env.SetOption(OptMaxReaders, maxreaders) if err != nil { - t.Error(err) + t.Fatal(err) } - _maxreaders, err := env.MaxReaders() + _maxreaders, err := env.GetOption(OptMaxReaders) if err != nil { - t.Error(err) + t.Fatal(err) } if _maxreaders < maxreaders { t.Errorf("unexpected MaxReaders: %v (< %v)", _maxreaders, maxreaders) @@ -203,27 +178,21 @@ func TestEnv_SetMaxReader(t *testing.T) { env.Close() t.Error(err) } - - err = env.SetMaxReaders(126) - if !IsErrnoSys(err, syscall.EPERM) { - t.Errorf("unexpected error: %v (!= %v)", err, syscall.EPERM) - } - _maxreaders, err = env.MaxReaders() - if err != nil { - t.Error(err) - } - if _maxreaders < maxreaders { - t.Errorf("unexpected MaxReaders: %v (!= %v)", _maxreaders, maxreaders) - } + // + //err = env.SetOption(OptMaxReaders, uint64(126)) + //if !IsErrnoSys(err, syscall.EPERM) { + // t.Errorf("unexpected error: %v (!= %v)", err, syscall.EPERM) + //} + //_maxreaders, err = env.GetOption(OptMaxReaders) + //if err != nil { + // t.Error(err) + //} + //if _maxreaders < maxreaders { + // t.Errorf("unexpected MaxReaders: %v (!= %v)", _maxreaders, maxreaders) + //} } func TestEnv_SetDebug(t *testing.T) { - dir, err := ioutil.TempDir("", "test-env-setmdebug-") - if err != nil { - t.Fatal(err) - } - defer os.RemoveAll(dir) - env, err := NewEnv() if err != nil { t.Error(err) @@ -237,7 +206,7 @@ func TestEnv_SetDebug(t *testing.T) { //func TestEnv_SetMapSize(t *testing.T) { // env := setup(t) -// defer clean(env, t) +// // // const minsize = 100 << 20 // 100MB // err := env.SetMapSize(minsize) @@ -262,7 +231,7 @@ func TestEnv_SetDebug(t *testing.T) { //func TestEnv_ReaderList(t *testing.T) { // env := setup(t) -// defer clean(env, t) +// // // var numreaders = 2 // @@ -308,7 +277,7 @@ func TestEnv_SetDebug(t *testing.T) { //func TestEnv_ReaderList_error(t *testing.T) { // env := setup(t) -// defer clean(env, t) +// // // var numreaders = 2 // @@ -381,7 +350,6 @@ func TestEnv_SetDebug(t *testing.T) { func TestEnv_ReaderCheck(t *testing.T) { env := setup(t) - defer clean(env, t) numDead, err := env.ReaderCheck() if err != nil { @@ -436,7 +404,7 @@ func TestEnv_ReaderCheck(t *testing.T) { // } // // env := setup(t) -// defer clean(env, t) +// // // item := struct{ k, v []byte }{ // []byte("k0"), @@ -500,7 +468,6 @@ func TestEnv_ReaderCheck(t *testing.T) { func TestEnv_Sync(t *testing.T) { env := setupFlags(t, SafeNoSync) - defer clean(env, t) item := struct{ k, v []byte }{[]byte("k0"), []byte("v0")} @@ -521,37 +488,32 @@ func TestEnv_Sync(t *testing.T) { } } -func setup(t T) *Env { +func setup(t testing.TB) *Env { return setupFlags(t, 0) } -func setupFlags(t T, flags uint) *Env { +func setupFlags(t testing.TB, flags uint) *Env { env, err := NewEnv() if err != nil { t.Fatalf("env: %s", err) } - 
path, err := ioutil.TempDir("", "mdb_test") - if err != nil { - t.Fatalf("tempdir: %v", err) - } - err = os.MkdirAll(path, 0770) - if err != nil { - t.Fatalf("mkdir: %s", path) - } - err = env.SetMaxDBs(1024) + path := t.TempDir() + err = env.SetOption(OptMaxDB, 1024) if err != nil { t.Fatalf("setmaxdbs: %v", err) } - err = env.SetGeometry(-1, -1, 10*1024*1024, -1, -1, 4096) + const pageSize = 4096 + err = env.SetGeometry(-1, -1, 64*1024*pageSize, -1, -1, pageSize) if err != nil { t.Fatalf("setgeometry: %v", err) } - flags |= UtterlyNoSync | NoMetaSync err = env.Open(path, flags, 0664) if err != nil { t.Fatalf("open: %s", err) } - + t.Cleanup(func() { + env.Close() + }) return env } @@ -560,31 +522,8 @@ type T interface { Fatalf(format string, vals ...interface{}) } -func clean(env *Env, t T) { - path, err := env.Path() - if err != nil { - t.Errorf("path: %v", err) - } - err = env.Close() - if err != nil { - t.Errorf("close: %s", err) - } - if path != "" { - err = os.RemoveAll(path) - if err != nil { - t.Errorf("remove: %v", err) - } - } -} - -func TestEnvCopy(t *testing.T) { - env := setup(t) - defer clean(env, t) -} - func TestEnv_MaxKeySize(t *testing.T) { env := setup(t) - defer clean(env, t) n := env.MaxKeySize() if n <= 0 { @@ -603,7 +542,6 @@ func TestEnv_MaxKeySize_nil(t *testing.T) { func TestEnv_CloseDBI(t *testing.T) { env := setup(t) - defer clean(env, t) const numdb = 1000 for i := 0; i < numdb; i++ { diff --git a/mdbx/error_unix.go b/mdbx/error_unix.go index 5a4edb2..71c57b1 100644 --- a/mdbx/error_unix.go +++ b/mdbx/error_unix.go @@ -4,7 +4,6 @@ package mdbx /* #include "mdbxgo.h" -#include "dist/mdbx.h" */ import "C" import ( diff --git a/mdbx/error_windows.go b/mdbx/error_windows.go index 3448a32..2955565 100644 --- a/mdbx/error_windows.go +++ b/mdbx/error_windows.go @@ -3,7 +3,6 @@ package mdbx /* #include #include "mdbxgo.h" -#include "dist/mdbx.h" */ import "C" import "syscall" diff --git a/mdbx/internal/arch/width.go b/mdbx/internal/arch/width.go deleted file mode 100644 index aa2c22d..0000000 --- a/mdbx/internal/arch/width.go +++ /dev/null @@ -1,7 +0,0 @@ -// Package lmdbarch contains some architecture detection constants. The -// primary reason the package exists is because the constant definitions are -// scary and some will not pass linters. -package arch - -// Width64 is 1 for 64-bit architectures and 0 otherwise. -const Width64 = 1 << (^uintptr(0) >> 63) / 2 diff --git a/mdbx/mdbx.c b/mdbx/mdbx.c new file mode 100644 index 0000000..06af057 --- /dev/null +++ b/mdbx/mdbx.c @@ -0,0 +1,29920 @@ +/* + * Copyright 2015-2021 Leonid Yuriev + * and other libmdbx authors: please see AUTHORS file. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * .
*/ + +#define xMDBX_ALLOY 1 +#define MDBX_BUILD_SOURCERY 220eee3b3a4b48cb20897d772c6665de65226e5811d687b6f7f43b2beeb9cb31_v0_10_1_15_g63e7276c +#ifdef MDBX_CONFIG_H +#include MDBX_CONFIG_H +#endif + +#define LIBMDBX_INTERNALS +#ifdef xMDBX_TOOLS +#define MDBX_DEPRECATED +#endif /* xMDBX_TOOLS */ + +#ifdef xMDBX_ALLOY +/* Amalgamated build */ +#define MDBX_INTERNAL_FUNC static +#define MDBX_INTERNAL_VAR static +#else +/* Non-amalgamated build */ +#define MDBX_INTERNAL_FUNC +#define MDBX_INTERNAL_VAR extern +#endif /* xMDBX_ALLOY */ + +/** Disables using GNU/Linux libc extensions. + * \ingroup build_option + * \note This option couldn't be moved to the options.h since dependant + * control macros/defined should be prepared before include the options.h */ +#ifndef MDBX_DISABLE_GNU_SOURCE +#define MDBX_DISABLE_GNU_SOURCE 0 +#endif +#if MDBX_DISABLE_GNU_SOURCE +#undef _GNU_SOURCE +#elif (defined(__linux__) || defined(__gnu_linux__)) && !defined(_GNU_SOURCE) +#define _GNU_SOURCE +#endif /* MDBX_DISABLE_GNU_SOURCE */ + +/*----------------------------------------------------------------------------*/ + +/* Should be defined before any includes */ +#ifndef _FILE_OFFSET_BITS +#define _FILE_OFFSET_BITS 64 +#endif + +#ifdef __APPLE__ +#define _DARWIN_C_SOURCE +#endif + +#ifdef _MSC_VER +#if _MSC_FULL_VER < 190024234 +/* Actually libmdbx was not tested with compilers older than 19.00.24234 (Visual + * Studio 2015 Update 3). But you could remove this #error and try to continue + * at your own risk. In such case please don't rise up an issues related ONLY to + * old compilers. + */ +#error \ + "At least \"Microsoft C/C++ Compiler\" version 19.00.24234 (Visual Studio 2015 Update 3) is required." +#endif +#ifndef _CRT_SECURE_NO_WARNINGS +#define _CRT_SECURE_NO_WARNINGS +#endif +#if _MSC_VER > 1800 +#pragma warning(disable : 4464) /* relative include path contains '..' */ +#endif +#if _MSC_VER > 1913 +#pragma warning(disable : 5045) /* Compiler will insert Spectre mitigation... \ + */ +#endif +#pragma warning(disable : 4710) /* 'xyz': function not inlined */ +#pragma warning(disable : 4711) /* function 'xyz' selected for automatic \ + inline expansion */ +#pragma warning( \ + disable : 4201) /* nonstandard extension used : nameless struct / union */ +#pragma warning(disable : 4702) /* unreachable code */ +#pragma warning(disable : 4706) /* assignment within conditional expression */ +#pragma warning(disable : 4127) /* conditional expression is constant */ +#pragma warning(disable : 4324) /* 'xyz': structure was padded due to \ + alignment specifier */ +#pragma warning(disable : 4310) /* cast truncates constant value */ +#pragma warning( \ + disable : 4820) /* bytes padding added after data member for alignment */ +#pragma warning(disable : 4548) /* expression before comma has no effect; \ + expected expression with side - effect */ +#pragma warning(disable : 4366) /* the result of the unary '&' operator may be \ + unaligned */ +#pragma warning(disable : 4200) /* nonstandard extension used: zero-sized \ + array in struct/union */ +#pragma warning(disable : 4204) /* nonstandard extension used: non-constant \ + aggregate initializer */ +#pragma warning( \ + disable : 4505) /* unreferenced local function has been removed */ +#endif /* _MSC_VER (warnings) */ + +#include "mdbx.h" +/* + * Copyright 2015-2021 Leonid Yuriev + * and other libmdbx authors: please see AUTHORS file. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . + */ + +/* *INDENT-OFF* */ +/* clang-format off */ + +#ifndef __GNUC_PREREQ +# if defined(__GNUC__) && defined(__GNUC_MINOR__) +# define __GNUC_PREREQ(maj, min) \ + ((__GNUC__ << 16) + __GNUC_MINOR__ >= ((maj) << 16) + (min)) +# else +# define __GNUC_PREREQ(maj, min) (0) +# endif +#endif /* __GNUC_PREREQ */ + +#ifndef __CLANG_PREREQ +# ifdef __clang__ +# define __CLANG_PREREQ(maj,min) \ + ((__clang_major__ << 16) + __clang_minor__ >= ((maj) << 16) + (min)) +# else +# define __CLANG_PREREQ(maj,min) (0) +# endif +#endif /* __CLANG_PREREQ */ + +#ifndef __GLIBC_PREREQ +# if defined(__GLIBC__) && defined(__GLIBC_MINOR__) +# define __GLIBC_PREREQ(maj, min) \ + ((__GLIBC__ << 16) + __GLIBC_MINOR__ >= ((maj) << 16) + (min)) +# else +# define __GLIBC_PREREQ(maj, min) (0) +# endif +#endif /* __GLIBC_PREREQ */ + +#ifndef __has_warning +# define __has_warning(x) (0) +#endif + +#ifndef __has_include +# define __has_include(x) (0) +#endif + +#if __has_feature(thread_sanitizer) +# define __SANITIZE_THREAD__ 1 +#endif + +#if __has_feature(address_sanitizer) +# define __SANITIZE_ADDRESS__ 1 +#endif + +/*----------------------------------------------------------------------------*/ + +#ifndef __extern_C +# ifdef __cplusplus +# define __extern_C extern "C" +# else +# define __extern_C +# endif +#endif /* __extern_C */ + +#if !defined(nullptr) && !defined(__cplusplus) || (__cplusplus < 201103L && !defined(_MSC_VER)) +# define nullptr NULL +#endif + +/*----------------------------------------------------------------------------*/ + +#ifndef __always_inline +# if defined(__GNUC__) || __has_attribute(__always_inline__) +# define __always_inline __inline __attribute__((__always_inline__)) +# elif defined(_MSC_VER) +# define __always_inline __forceinline +# else +# define __always_inline +# endif +#endif /* __always_inline */ + +#ifndef __noinline +# if defined(__GNUC__) || __has_attribute(__noinline__) +# define __noinline __attribute__((__noinline__)) +# elif defined(_MSC_VER) +# define __noinline __declspec(noinline) +# else +# define __noinline +# endif +#endif /* __noinline */ + +#ifndef __must_check_result +# if defined(__GNUC__) || __has_attribute(__warn_unused_result__) +# define __must_check_result __attribute__((__warn_unused_result__)) +# else +# define __must_check_result +# endif +#endif /* __must_check_result */ + +#if !defined(__noop) && !defined(_MSC_VER) +# define __noop(...) 
do {} while(0) +#endif /* __noop */ + +#ifndef __fallthrough +# if defined(__cplusplus) && (__has_cpp_attribute(fallthrough) && \ + (!defined(__clang__) || __clang__ > 4)) || __cplusplus >= 201703L +# define __fallthrough [[fallthrough]] +# elif __GNUC_PREREQ(8, 0) && defined(__cplusplus) && __cplusplus >= 201103L +# define __fallthrough [[fallthrough]] +# elif __GNUC_PREREQ(7, 0) && \ + (!defined(__LCC__) || (__LCC__ == 124 && __LCC_MINOR__ >= 12) || \ + (__LCC__ == 125 && __LCC_MINOR__ >= 5) || (__LCC__ >= 126)) +# define __fallthrough __attribute__((__fallthrough__)) +# elif defined(__clang__) && defined(__cplusplus) && __cplusplus >= 201103L &&\ + __has_feature(cxx_attributes) && __has_warning("-Wimplicit-fallthrough") +# define __fallthrough [[clang::fallthrough]] +# else +# define __fallthrough +# endif +#endif /* __fallthrough */ + +#ifndef __unreachable +# if __GNUC_PREREQ(4,5) || __has_builtin(__builtin_unreachable) +# define __unreachable() __builtin_unreachable() +# elif defined(_MSC_VER) +# define __unreachable() __assume(0) +# else +# define __unreachable() __noop() +# endif +#endif /* __unreachable */ + +#ifndef __prefetch +# if defined(__GNUC__) || defined(__clang__) || __has_builtin(__builtin_prefetch) +# define __prefetch(ptr) __builtin_prefetch(ptr) +# else +# define __prefetch(ptr) __noop(ptr) +# endif +#endif /* __prefetch */ + +#ifndef __nothrow +# if defined(__cplusplus) +# if __cplusplus < 201703L +# define __nothrow throw() +# else +# define __nothrow noexcept(true) +# endif /* __cplusplus */ +# elif defined(__GNUC__) || __has_attribute(__nothrow__) +# define __nothrow __attribute__((__nothrow__)) +# elif defined(_MSC_VER) && defined(__cplusplus) +# define __nothrow __declspec(nothrow) +# else +# define __nothrow +# endif +#endif /* __nothrow */ + +#ifndef __hidden +# if defined(__GNUC__) || __has_attribute(__visibility__) +# define __hidden __attribute__((__visibility__("hidden"))) +# else +# define __hidden +# endif +#endif /* __hidden */ + +#ifndef __optimize +# if defined(__OPTIMIZE__) +# if (defined(__GNUC__) && !defined(__clang__)) || __has_attribute(__optimize__) +# define __optimize(ops) __attribute__((__optimize__(ops))) +# else +# define __optimize(ops) +# endif +# else +# define __optimize(ops) +# endif +#endif /* __optimize */ + +#ifndef __hot +# if defined(__OPTIMIZE__) +# if defined(__e2k__) +# define __hot __attribute__((__hot__)) __optimize(3) +# elif defined(__clang__) && !__has_attribute(__hot_) \ + && __has_attribute(__section__) && (defined(__linux__) || defined(__gnu_linux__)) + /* just put frequently used functions in separate section */ +# define __hot __attribute__((__section__("text.hot"))) __optimize("O3") +# elif defined(__GNUC__) || __has_attribute(__hot__) +# define __hot __attribute__((__hot__)) __optimize("O3") +# else +# define __hot __optimize("O3") +# endif +# else +# define __hot +# endif +#endif /* __hot */ + +#ifndef __cold +# if defined(__OPTIMIZE__) +# if defined(__e2k__) +# define __cold __attribute__((__cold__)) __optimize(1) +# elif defined(__clang__) && !__has_attribute(cold) \ + && __has_attribute(__section__) && (defined(__linux__) || defined(__gnu_linux__)) + /* just put infrequently used functions in separate section */ +# define __cold __attribute__((__section__("text.unlikely"))) __optimize("Os") +# elif defined(__GNUC__) || __has_attribute(cold) +# define __cold __attribute__((__cold__)) __optimize("Os") +# else +# define __cold __optimize("Os") +# endif +# else +# define __cold +# endif +#endif /* __cold */ + 
+#ifndef __flatten +# if defined(__OPTIMIZE__) && (defined(__GNUC__) || __has_attribute(__flatten__)) +# define __flatten __attribute__((__flatten__)) +# else +# define __flatten +# endif +#endif /* __flatten */ + +#ifndef likely +# if (defined(__GNUC__) || __has_builtin(__builtin_expect)) && !defined(__COVERITY__) +# define likely(cond) __builtin_expect(!!(cond), 1) +# else +# define likely(x) (!!(x)) +# endif +#endif /* likely */ + +#ifndef unlikely +# if (defined(__GNUC__) || __has_builtin(__builtin_expect)) && !defined(__COVERITY__) +# define unlikely(cond) __builtin_expect(!!(cond), 0) +# else +# define unlikely(x) (!!(x)) +# endif +#endif /* unlikely */ + +#ifndef __anonymous_struct_extension__ +# if defined(__GNUC__) +# define __anonymous_struct_extension__ __extension__ +# else +# define __anonymous_struct_extension__ +# endif +#endif /* __anonymous_struct_extension__ */ + +#ifndef __Wpedantic_format_voidptr + MDBX_MAYBE_UNUSED MDBX_PURE_FUNCTION static __inline const void* + __Wpedantic_format_voidptr(const void* ptr) {return ptr;} +# define __Wpedantic_format_voidptr(ARG) __Wpedantic_format_voidptr(ARG) +#endif /* __Wpedantic_format_voidptr */ + +/*----------------------------------------------------------------------------*/ + +#if defined(MDBX_USE_VALGRIND) +# include +# ifndef VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE + /* LY: available since Valgrind 3.10 */ +# define VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s) +# define VALGRIND_ENABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s) +# endif +#elif !defined(RUNNING_ON_VALGRIND) +# define VALGRIND_CREATE_MEMPOOL(h,r,z) +# define VALGRIND_DESTROY_MEMPOOL(h) +# define VALGRIND_MEMPOOL_TRIM(h,a,s) +# define VALGRIND_MEMPOOL_ALLOC(h,a,s) +# define VALGRIND_MEMPOOL_FREE(h,a) +# define VALGRIND_MEMPOOL_CHANGE(h,a,b,s) +# define VALGRIND_MAKE_MEM_NOACCESS(a,s) +# define VALGRIND_MAKE_MEM_DEFINED(a,s) +# define VALGRIND_MAKE_MEM_UNDEFINED(a,s) +# define VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s) +# define VALGRIND_ENABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s) +# define VALGRIND_CHECK_MEM_IS_ADDRESSABLE(a,s) (0) +# define VALGRIND_CHECK_MEM_IS_DEFINED(a,s) (0) +# define RUNNING_ON_VALGRIND (0) +#endif /* MDBX_USE_VALGRIND */ + +#ifdef __SANITIZE_ADDRESS__ +# include +#elif !defined(ASAN_POISON_MEMORY_REGION) +# define ASAN_POISON_MEMORY_REGION(addr, size) \ + ((void)(addr), (void)(size)) +# define ASAN_UNPOISON_MEMORY_REGION(addr, size) \ + ((void)(addr), (void)(size)) +#endif /* __SANITIZE_ADDRESS__ */ + +/*----------------------------------------------------------------------------*/ + +#ifndef ARRAY_LENGTH +# ifdef __cplusplus + template + char (&__ArraySizeHelper(T (&array)[N]))[N]; +# define ARRAY_LENGTH(array) (sizeof(::__ArraySizeHelper(array))) +# else +# define ARRAY_LENGTH(array) (sizeof(array) / sizeof(array[0])) +# endif +#endif /* ARRAY_LENGTH */ + +#ifndef ARRAY_END +# define ARRAY_END(array) (&array[ARRAY_LENGTH(array)]) +#endif /* ARRAY_END */ + +#ifndef STRINGIFY +# define STRINGIFY_HELPER(x) #x +# define STRINGIFY(x) STRINGIFY_HELPER(x) +#endif /* STRINGIFY */ + +#define CONCAT(a,b) a##b +#define XCONCAT(a,b) CONCAT(a,b) + +#ifndef offsetof +# define offsetof(type, member) __builtin_offsetof(type, member) +#endif /* offsetof */ + +#ifndef container_of +# define container_of(ptr, type, member) \ + ((type *)((char *)(ptr) - offsetof(type, member))) +#endif /* container_of */ + +#define MDBX_TETRAD(a, b, c, d) \ + ((uint32_t)(a) << 24 | (uint32_t)(b) << 16 | (uint32_t)(c) << 8 | (d)) + +#define 
MDBX_STRING_TETRAD(str) MDBX_TETRAD(str[0], str[1], str[2], str[3]) + +#define FIXME "FIXME: " __FILE__ ", " STRINGIFY(__LINE__) + +#ifndef STATIC_ASSERT_MSG +# if defined(static_assert) +# define STATIC_ASSERT_MSG(expr, msg) static_assert(expr, msg) +# elif defined(_STATIC_ASSERT) +# define STATIC_ASSERT_MSG(expr, msg) _STATIC_ASSERT(expr) +# elif defined(_MSC_VER) +# include +# define STATIC_ASSERT_MSG(expr, msg) _STATIC_ASSERT(expr) +# elif (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) \ + || __has_feature(c_static_assert) +# define STATIC_ASSERT_MSG(expr, msg) _Static_assert(expr, msg) +# else +# define STATIC_ASSERT_MSG(expr, msg) switch (0) {case 0:case (expr):;} +# endif +#endif /* STATIC_ASSERT */ + +#ifndef STATIC_ASSERT +# define STATIC_ASSERT(expr) STATIC_ASSERT_MSG(expr, #expr) +#endif + +/* *INDENT-ON* */ +/* clang-format on */ + +#if defined(__GNUC__) && !__GNUC_PREREQ(4, 2) +/* Actually libmdbx was not tested with compilers older than GCC 4.2. + * But you could ignore this warning at your own risk. + * In such case please don't rise up an issues related ONLY to old compilers. + */ +#warning "libmdbx required GCC >= 4.2" +#endif + +#if defined(__clang__) && !__CLANG_PREREQ(3, 8) +/* Actually libmdbx was not tested with CLANG older than 3.8. + * But you could ignore this warning at your own risk. + * In such case please don't rise up an issues related ONLY to old compilers. + */ +#warning "libmdbx required CLANG >= 3.8" +#endif + +#if defined(__GLIBC__) && !__GLIBC_PREREQ(2, 12) +/* Actually libmdbx was not tested with something older than glibc 2.12. + * But you could ignore this warning at your own risk. + * In such case please don't rise up an issues related ONLY to old systems. + */ +#warning "libmdbx was only tested with GLIBC >= 2.12." +#endif + +#ifdef __SANITIZE_THREAD__ +#warning \ + "libmdbx don't compatible with ThreadSanitizer, you will get a lot of false-positive issues." +#endif /* __SANITIZE_THREAD__ */ + +#if __has_warning("-Wnested-anon-types") +#if defined(__clang__) +#pragma clang diagnostic ignored "-Wnested-anon-types" +#elif defined(__GNUC__) +#pragma GCC diagnostic ignored "-Wnested-anon-types" +#else +#pragma warning disable "nested-anon-types" +#endif +#endif /* -Wnested-anon-types */ + +#if __has_warning("-Wconstant-logical-operand") +#if defined(__clang__) +#pragma clang diagnostic ignored "-Wconstant-logical-operand" +#elif defined(__GNUC__) +#pragma GCC diagnostic ignored "-Wconstant-logical-operand" +#else +#pragma warning disable "constant-logical-operand" +#endif +#endif /* -Wconstant-logical-operand */ + +#if defined(__LCC__) && (__LCC__ <= 121) +/* bug #2798 */ +#pragma diag_suppress alignment_reduction_ignored +#elif defined(__ICC) +#pragma warning(disable : 3453 1366) +#elif __has_warning("-Walignment-reduction-ignored") +#if defined(__clang__) +#pragma clang diagnostic ignored "-Walignment-reduction-ignored" +#elif defined(__GNUC__) +#pragma GCC diagnostic ignored "-Walignment-reduction-ignored" +#else +#pragma warning disable "alignment-reduction-ignored" +#endif +#endif /* -Walignment-reduction-ignored */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ + +/* + * Copyright 2015-2021 Leonid Yuriev + * and other libmdbx authors: please see AUTHORS file. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. 
+ * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . + */ + + +/*----------------------------------------------------------------------------*/ +/* Microsoft compiler generates a lot of warning for self includes... */ + +#ifdef _MSC_VER +#pragma warning(push, 1) +#pragma warning(disable : 4548) /* expression before comma has no effect; \ + expected expression with side - effect */ +#pragma warning(disable : 4530) /* C++ exception handler used, but unwind \ + * semantics are not enabled. Specify /EHsc */ +#pragma warning(disable : 4577) /* 'noexcept' used with no exception handling \ + * mode specified; termination on exception is \ + * not guaranteed. Specify /EHsc */ +#endif /* _MSC_VER (warnings) */ + +#if defined(_WIN32) || defined(_WIN64) +#if !defined(_CRT_SECURE_NO_WARNINGS) +#define _CRT_SECURE_NO_WARNINGS +#endif +#if !defined(_NO_CRT_STDIO_INLINE) && MDBX_BUILD_SHARED_LIBRARY && \ + !defined(xMDBX_TOOLS) && MDBX_WITHOUT_MSVC_CRT +#define _NO_CRT_STDIO_INLINE +#endif +#elif !defined(_POSIX_C_SOURCE) +#define _POSIX_C_SOURCE 200809L +#endif /* Windows */ + +/*----------------------------------------------------------------------------*/ +/* C99 includes */ +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +/* C11 stdalign.h */ +#if __has_include() +#include +#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L +#define alignas(N) _Alignas(N) +#elif defined(_MSC_VER) +#define alignas(N) __declspec(align(N)) +#elif __has_attribute(__aligned__) || defined(__GNUC__) +#define alignas(N) __attribute__((__aligned__(N))) +#else +#error "FIXME: Required _alignas() or equivalent." +#endif + +/*----------------------------------------------------------------------------*/ +/* Systems includes */ + +#ifdef __APPLE__ +#include +#endif /* Apple OSX & iOS */ + +#if defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ + defined(__BSD__) || defined(__bsdi__) || defined(__DragonFly__) || \ + defined(__APPLE__) || defined(__MACH__) +#include +#include +#include +#include +#if defined(__FreeBSD__) || defined(__DragonFly__) +#include +#elif defined(__OpenBSD__) || defined(__NetBSD__) +#include +#else +#define SYSCTL_LEGACY_NONCONST_MIB +#endif +#ifndef __MACH__ +#include +#endif +#else +#include +#if !(defined(__sun) || defined(__SVR4) || defined(__svr4__) || \ + defined(_WIN32) || defined(_WIN64)) +#include +#endif /* !Solaris */ +#endif /* !xBSD */ + +#if defined(__FreeBSD__) || __has_include() +#include +#endif + +#if defined(__APPLE__) || defined(__MACH__) || __has_include() +#include +#endif /* MacOS */ + +#if defined(__MACH__) +#include +#include +#include +#include +#undef P_DIRTY +#endif + +#if defined(__linux__) || defined(__gnu_linux__) +#include +#include +#include +#include +#endif /* Linux */ + +#ifndef _XOPEN_SOURCE +#define _XOPEN_SOURCE 0 +#endif + +#ifndef _XOPEN_SOURCE_EXTENDED +#define _XOPEN_SOURCE_EXTENDED 0 +#else +#include +#endif /* _XOPEN_SOURCE_EXTENDED */ + +#if defined(__sun) || defined(__SVR4) || defined(__svr4__) +#include +#include +/* On Solaris, it's easier to add a missing prototype rather than find a + * combination of #defines that break nothing. 
*/ +__extern_C key_t ftok(const char *, int); +#endif /* SunOS/Solaris */ + +#if defined(_WIN32) || defined(_WIN64) +#ifndef _WIN32_WINNT +#define _WIN32_WINNT 0x0601 /* Windows 7 */ +#elif _WIN32_WINNT < 0x0500 +#error At least 'Windows 2000' API is required for libmdbx. +#endif /* _WIN32_WINNT */ +#if (defined(__MINGW32__) || defined(__MINGW64__)) && \ + !defined(__USE_MINGW_ANSI_STDIO) +#define __USE_MINGW_ANSI_STDIO 1 +#endif /* MinGW */ +#ifndef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#endif /* WIN32_LEAN_AND_MEAN */ +#include +#include +#include +#include +#include +#define HAVE_SYS_STAT_H +#define HAVE_SYS_TYPES_H +typedef HANDLE mdbx_thread_t; +typedef unsigned mdbx_thread_key_t; +#define MAP_FAILED NULL +#define HIGH_DWORD(v) ((DWORD)((sizeof(v) > 4) ? ((uint64_t)(v) >> 32) : 0)) +#define THREAD_CALL WINAPI +#define THREAD_RESULT DWORD +typedef struct { + HANDLE mutex; + HANDLE event[2]; +} mdbx_condpair_t; +typedef CRITICAL_SECTION mdbx_fastmutex_t; + +#if !defined(_MSC_VER) && !defined(__try) +/* *INDENT-OFF* */ +/* clang-format off */ +#define __try +#define __except(COND) if(false) +/* *INDENT-ON* */ +/* clang-format on */ +#endif /* stub for MSVC's __try/__except */ + +#if MDBX_WITHOUT_MSVC_CRT + +#ifndef mdbx_malloc +static inline void *mdbx_malloc(size_t bytes) { + return HeapAlloc(GetProcessHeap(), 0, bytes); +} +#endif /* mdbx_malloc */ + +#ifndef mdbx_calloc +static inline void *mdbx_calloc(size_t nelem, size_t size) { + return HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, nelem * size); +} +#endif /* mdbx_calloc */ + +#ifndef mdbx_realloc +static inline void *mdbx_realloc(void *ptr, size_t bytes) { + return ptr ? HeapReAlloc(GetProcessHeap(), 0, ptr, bytes) + : HeapAlloc(GetProcessHeap(), 0, bytes); +} +#endif /* mdbx_realloc */ + +#ifndef mdbx_free +static inline void mdbx_free(void *ptr) { HeapFree(GetProcessHeap(), 0, ptr); } +#endif /* mdbx_free */ + +#else /* MDBX_WITHOUT_MSVC_CRT */ + +#define mdbx_malloc malloc +#define mdbx_calloc calloc +#define mdbx_realloc realloc +#define mdbx_free free +#define mdbx_strdup _strdup + +#endif /* MDBX_WITHOUT_MSVC_CRT */ + +#ifndef snprintf +#define snprintf _snprintf /* ntdll */ +#endif + +#ifndef vsnprintf +#define vsnprintf _vsnprintf /* ntdll */ +#endif + +#else /*----------------------------------------------------------------------*/ + +#include +#if !defined(_POSIX_MAPPED_FILES) || _POSIX_MAPPED_FILES < 1 +#error "libmdbx requires the _POSIX_MAPPED_FILES feature" +#endif /* _POSIX_MAPPED_FILES */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +typedef pthread_t mdbx_thread_t; +typedef pthread_key_t mdbx_thread_key_t; +#define INVALID_HANDLE_VALUE (-1) +#define THREAD_CALL +#define THREAD_RESULT void * +typedef struct { + pthread_mutex_t mutex; + pthread_cond_t cond[2]; +} mdbx_condpair_t; +typedef pthread_mutex_t mdbx_fastmutex_t; +#define mdbx_malloc malloc +#define mdbx_calloc calloc +#define mdbx_realloc realloc +#define mdbx_free free +#define mdbx_strdup strdup +#endif /* Platform */ + +#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) +/* malloc_usable_size() already provided */ +#elif defined(__APPLE__) +#define malloc_usable_size(ptr) malloc_size(ptr) +#elif defined(_MSC_VER) && !MDBX_WITHOUT_MSVC_CRT +#define malloc_usable_size(ptr) _msize(ptr) +#endif /* malloc_usable_size */ + +#ifdef __ANDROID_API__ +#include +#if __ANDROID_API__ >= 21 +#include +#endif +#endif /* Android */ + +/* *INDENT-OFF* */ +/* clang-format 
off */ +#if defined(HAVE_SYS_STAT_H) || __has_include() +#include +#endif +#if defined(HAVE_SYS_TYPES_H) || __has_include() +#include +#endif +#if defined(HAVE_SYS_FILE_H) || __has_include() +#include +#endif +/* *INDENT-ON* */ +/* clang-format on */ + +#ifndef SSIZE_MAX +#define SSIZE_MAX INTPTR_MAX +#endif + +#if !defined(MADV_DODUMP) && defined(MADV_CORE) +#define MADV_DODUMP MADV_CORE +#endif /* MADV_CORE -> MADV_DODUMP */ + +#if !defined(MADV_DONTDUMP) && defined(MADV_NOCORE) +#define MADV_DONTDUMP MADV_NOCORE +#endif /* MADV_NOCORE -> MADV_DONTDUMP */ + +#if defined(i386) || defined(__386) || defined(__i386) || defined(__i386__) || \ + defined(i486) || defined(__i486) || defined(__i486__) || \ + defined(i586) | defined(__i586) || defined(__i586__) || defined(i686) || \ + defined(__i686) || defined(__i686__) || defined(_M_IX86) || \ + defined(_X86_) || defined(__THW_INTEL__) || defined(__I86__) || \ + defined(__INTEL__) || defined(__x86_64) || defined(__x86_64__) || \ + defined(__amd64__) || defined(__amd64) || defined(_M_X64) || \ + defined(_M_AMD64) || defined(__IA32__) || defined(__INTEL__) +#ifndef __ia32__ +/* LY: define neutral __ia32__ for x86 and x86-64 */ +#define __ia32__ 1 +#endif /* __ia32__ */ +#if !defined(__amd64__) && (defined(__x86_64) || defined(__x86_64__) || \ + defined(__amd64) || defined(_M_X64)) +/* LY: define trusty __amd64__ for all AMD64/x86-64 arch */ +#define __amd64__ 1 +#endif /* __amd64__ */ +#endif /* all x86 */ + +#if (-6 & 5) || CHAR_BIT != 8 || UINT_MAX < 0xffffffff || ULONG_MAX % 0xFFFF +#error \ + "Sanity checking failed: Two's complement, reasonably sized integer types" +#endif + +#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul +#define MDBX_WORDBITS 64 +#else +#define MDBX_WORDBITS 32 +#endif /* MDBX_WORDBITS */ + +/*----------------------------------------------------------------------------*/ +/* Compiler's includes for builtins/intrinsics */ + +#if defined(_MSC_VER) || defined(__INTEL_COMPILER) +#include +#elif __GNUC_PREREQ(4, 4) || defined(__clang__) +#if defined(__ia32__) || defined(__e2k__) +#include +#endif /* __ia32__ */ +#if defined(__ia32__) +#include +#endif /* __ia32__ */ +#elif defined(__SUNPRO_C) || defined(__sun) || defined(sun) +#include +#elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) && \ + (defined(HP_IA64) || defined(__ia64)) +#include +#elif defined(__IBMC__) && defined(__powerpc) +#include +#elif defined(_AIX) +#include +#include +#elif (defined(__osf__) && defined(__DECC)) || defined(__alpha) +#include +#include +#elif defined(__MWERKS__) +/* CodeWarrior - troubles ? */ +#pragma gcc_extensions +#elif defined(__SNC__) +/* Sony PS3 - troubles ? 
*/ +#elif defined(__hppa__) || defined(__hppa) +#include +#else +#error Unsupported C compiler, please use GNU C 4.4 or newer +#endif /* Compiler */ + +/*----------------------------------------------------------------------------*/ +/* Byteorder */ + +#if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) || \ + !defined(__ORDER_BIG_ENDIAN__) + +/* *INDENT-OFF* */ +/* clang-format off */ +#if defined(__GLIBC__) || defined(__GNU_LIBRARY__) || defined(__ANDROID_API__) || \ + defined(HAVE_ENDIAN_H) || __has_include() +#include +#elif defined(__APPLE__) || defined(__MACH__) || defined(__OpenBSD__) || \ + defined(HAVE_MACHINE_ENDIAN_H) || __has_include() +#include +#elif defined(HAVE_SYS_ISA_DEFS_H) || __has_include() +#include +#elif (defined(HAVE_SYS_TYPES_H) && defined(HAVE_SYS_ENDIAN_H)) || \ + (__has_include() && __has_include()) +#include +#include +#elif defined(__bsdi__) || defined(__DragonFly__) || defined(__FreeBSD__) || \ + defined(__NetBSD__) || \ + defined(HAVE_SYS_PARAM_H) || __has_include() +#include +#endif /* OS */ +/* *INDENT-ON* */ +/* clang-format on */ + +#if defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && defined(__BIG_ENDIAN) +#define __ORDER_LITTLE_ENDIAN__ __LITTLE_ENDIAN +#define __ORDER_BIG_ENDIAN__ __BIG_ENDIAN +#define __BYTE_ORDER__ __BYTE_ORDER +#elif defined(_BYTE_ORDER) && defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN) +#define __ORDER_LITTLE_ENDIAN__ _LITTLE_ENDIAN +#define __ORDER_BIG_ENDIAN__ _BIG_ENDIAN +#define __BYTE_ORDER__ _BYTE_ORDER +#else +#define __ORDER_LITTLE_ENDIAN__ 1234 +#define __ORDER_BIG_ENDIAN__ 4321 + +#if defined(__LITTLE_ENDIAN__) || \ + (defined(_LITTLE_ENDIAN) && !defined(_BIG_ENDIAN)) || \ + defined(__ARMEL__) || defined(__THUMBEL__) || defined(__AARCH64EL__) || \ + defined(__MIPSEL__) || defined(_MIPSEL) || defined(__MIPSEL) || \ + defined(_M_ARM) || defined(_M_ARM64) || defined(__e2k__) || \ + defined(__elbrus_4c__) || defined(__elbrus_8c__) || defined(__bfin__) || \ + defined(__BFIN__) || defined(__ia64__) || defined(_IA64) || \ + defined(__IA64__) || defined(__ia64) || defined(_M_IA64) || \ + defined(__itanium__) || defined(__ia32__) || defined(__CYGWIN__) || \ + defined(_WIN64) || defined(_WIN32) || defined(__TOS_WIN__) || \ + defined(__WINDOWS__) +#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__ + +#elif defined(__BIG_ENDIAN__) || \ + (defined(_BIG_ENDIAN) && !defined(_LITTLE_ENDIAN)) || \ + defined(__ARMEB__) || defined(__THUMBEB__) || defined(__AARCH64EB__) || \ + defined(__MIPSEB__) || defined(_MIPSEB) || defined(__MIPSEB) || \ + defined(__m68k__) || defined(M68000) || defined(__hppa__) || \ + defined(__hppa) || defined(__HPPA__) || defined(__sparc__) || \ + defined(__sparc) || defined(__370__) || defined(__THW_370__) || \ + defined(__s390__) || defined(__s390x__) || defined(__SYSC_ZARCH__) +#define __BYTE_ORDER__ __ORDER_BIG_ENDIAN__ + +#else +#error __BYTE_ORDER__ should be defined. +#endif /* Arch */ + +#endif +#endif /* __BYTE_ORDER__ || __ORDER_LITTLE_ENDIAN__ || __ORDER_BIG_ENDIAN__ */ + +/* Get the size of a memory page for the system. + * This is the basic size that the platform's memory manager uses, and is + * fundamental to the use of memory-mapped files. 
*/ +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline size_t +mdbx_syspagesize(void) { +#if defined(_WIN32) || defined(_WIN64) + SYSTEM_INFO si; + GetSystemInfo(&si); + return si.dwPageSize; +#else + return sysconf(_SC_PAGE_SIZE); +#endif +} + +typedef struct mdbx_mmap_param { + union { + void *address; + uint8_t *dxb; + struct MDBX_lockinfo *lck; + }; + mdbx_filehandle_t fd; + size_t limit; /* mapping length, but NOT a size of file nor DB */ + size_t current; /* mapped region size, i.e. the size of file and DB */ +#if defined(_WIN32) || defined(_WIN64) + uint64_t filesize /* in-process cache of a file size */; + HANDLE section; /* memory-mapped section handle */ +#endif +} mdbx_mmap_t; + +typedef union bin128 { + __anonymous_struct_extension__ struct { uint64_t x, y; }; + __anonymous_struct_extension__ struct { uint32_t a, b, c, d; }; +} bin128_t; + +#if defined(_WIN32) || defined(_WIN64) +typedef union MDBX_srwlock { + struct { + long volatile readerCount; + long volatile writerCount; + }; + RTL_SRWLOCK native; +} MDBX_srwlock; +#endif /* Windows */ + +#ifdef __cplusplus +extern void mdbx_osal_jitter(bool tiny); +#else + +/*----------------------------------------------------------------------------*/ +/* Atomics */ + +#if defined(__cplusplus) && !defined(__STDC_NO_ATOMICS__) && (__has_include() || __has_extension(cxx_atomic)) +#include +#define MDBX_HAVE_C11ATOMICS +#elif !defined(__cplusplus) && \ + (__STDC_VERSION__ >= 201112L || __has_extension(c_atomic)) && \ + !defined(__STDC_NO_ATOMICS__) && \ + (__GNUC_PREREQ(4, 9) || __CLANG_PREREQ(3, 8) || \ + !(defined(__GNUC__) || defined(__clang__))) +#include +#define MDBX_HAVE_C11ATOMICS +#elif defined(__GNUC__) || defined(__clang__) +#elif defined(_MSC_VER) +#pragma warning(disable : 4163) /* 'xyz': not available as an intrinsic */ +#pragma warning(disable : 4133) /* 'function': incompatible types - from \ + 'size_t' to 'LONGLONG' */ +#pragma warning(disable : 4244) /* 'return': conversion from 'LONGLONG' to \ + 'std::size_t', possible loss of data */ +#pragma warning(disable : 4267) /* 'function': conversion from 'size_t' to \ + 'long', possible loss of data */ +#pragma intrinsic(_InterlockedExchangeAdd, _InterlockedCompareExchange) +#pragma intrinsic(_InterlockedExchangeAdd64, _InterlockedCompareExchange64) +#elif defined(__APPLE__) +#include +#else +#error FIXME atomic-ops +#endif + +/*----------------------------------------------------------------------------*/ +/* Memory/Compiler barriers, cache coherence */ + +#if __has_include() +#include +#elif defined(__mips) || defined(__mips__) || defined(__mips64) || \ + defined(__mips64__) || defined(_M_MRX000) || defined(_MIPS_) || \ + defined(__MWERKS__) || defined(__sgi) +/* MIPS should have explicit cache control */ +#include +#endif + +MDBX_MAYBE_UNUSED static __inline void mdbx_compiler_barrier(void) { +#if defined(__clang__) || defined(__GNUC__) + __asm__ __volatile__("" ::: "memory"); +#elif defined(_MSC_VER) + _ReadWriteBarrier(); +#elif defined(__INTEL_COMPILER) /* LY: Intel Compiler may mimic GCC and MSC */ + __memory_barrier(); +#elif defined(__SUNPRO_C) || defined(__sun) || defined(sun) + __compiler_barrier(); +#elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) && \ + (defined(HP_IA64) || defined(__ia64)) + _Asm_sched_fence(/* LY: no-arg meaning 'all expect ALU', e.g. 
0x3D3D */); +#elif defined(_AIX) || defined(__ppc__) || defined(__powerpc__) || \ + defined(__ppc64__) || defined(__powerpc64__) + __fence(); +#else +#error "Could not guess the kind of compiler, please report to us." +#endif +} + +MDBX_MAYBE_UNUSED static __inline void mdbx_memory_barrier(void) { +#ifdef MDBX_HAVE_C11ATOMICS + atomic_thread_fence(memory_order_seq_cst); +#elif defined(__ATOMIC_SEQ_CST) +#ifdef __clang__ + __c11_atomic_thread_fence(__ATOMIC_SEQ_CST); +#else + __atomic_thread_fence(__ATOMIC_SEQ_CST); +#endif +#elif defined(__clang__) || defined(__GNUC__) + __sync_synchronize(); +#elif defined(_WIN32) || defined(_WIN64) + MemoryBarrier(); +#elif defined(__INTEL_COMPILER) /* LY: Intel Compiler may mimic GCC and MSC */ +#if defined(__ia32__) + _mm_mfence(); +#else + __mf(); +#endif +#elif defined(__SUNPRO_C) || defined(__sun) || defined(sun) + __machine_rw_barrier(); +#elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) && \ + (defined(HP_IA64) || defined(__ia64)) + _Asm_mf(); +#elif defined(_AIX) || defined(__ppc__) || defined(__powerpc__) || \ + defined(__ppc64__) || defined(__powerpc64__) + __lwsync(); +#else +#error "Could not guess the kind of compiler, please report to us." +#endif +} + +/*----------------------------------------------------------------------------*/ +/* libc compatibility stuff */ + +#if (!defined(__GLIBC__) && __GLIBC_PREREQ(2, 1)) && \ + (defined(_GNU_SOURCE) || defined(_BSD_SOURCE)) +#define mdbx_asprintf asprintf +#define mdbx_vasprintf vasprintf +#else +MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC + MDBX_PRINTF_ARGS(2, 3) int mdbx_asprintf(char **strp, const char *fmt, ...); +MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, va_list ap); +#endif + +/*----------------------------------------------------------------------------*/ +/* OS abstraction layer stuff */ + +/* max bytes to write in one call */ +#if defined(_WIN32) || defined(_WIN64) +#define MAX_WRITE UINT32_C(0x01000000) +#else +#define MAX_WRITE UINT32_C(0x3fff0000) +#endif + +#if defined(__linux__) || defined(__gnu_linux__) +MDBX_INTERNAL_VAR uint32_t mdbx_linux_kernel_version; +MDBX_INTERNAL_VAR bool mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */; +#endif /* Linux */ + +#ifndef mdbx_strdup +LIBMDBX_API char *mdbx_strdup(const char *str); +#endif + +MDBX_MAYBE_UNUSED static __inline int mdbx_get_errno(void) { +#if defined(_WIN32) || defined(_WIN64) + DWORD rc = GetLastError(); +#else + int rc = errno; +#endif + return rc; +} + +#ifndef mdbx_memalign_alloc +MDBX_INTERNAL_FUNC int mdbx_memalign_alloc(size_t alignment, size_t bytes, + void **result); +#endif +#ifndef mdbx_memalign_free +MDBX_INTERNAL_FUNC void mdbx_memalign_free(void *ptr); +#endif + +MDBX_INTERNAL_FUNC int mdbx_condpair_init(mdbx_condpair_t *condpair); +MDBX_INTERNAL_FUNC int mdbx_condpair_lock(mdbx_condpair_t *condpair); +MDBX_INTERNAL_FUNC int mdbx_condpair_unlock(mdbx_condpair_t *condpair); +MDBX_INTERNAL_FUNC int mdbx_condpair_signal(mdbx_condpair_t *condpair, + bool part); +MDBX_INTERNAL_FUNC int mdbx_condpair_wait(mdbx_condpair_t *condpair, bool part); +MDBX_INTERNAL_FUNC int mdbx_condpair_destroy(mdbx_condpair_t *condpair); + +MDBX_INTERNAL_FUNC int mdbx_fastmutex_init(mdbx_fastmutex_t *fastmutex); +MDBX_INTERNAL_FUNC int mdbx_fastmutex_acquire(mdbx_fastmutex_t *fastmutex); +MDBX_INTERNAL_FUNC int mdbx_fastmutex_release(mdbx_fastmutex_t *fastmutex); +MDBX_INTERNAL_FUNC int mdbx_fastmutex_destroy(mdbx_fastmutex_t *fastmutex); + +MDBX_INTERNAL_FUNC int mdbx_pwritev(mdbx_filehandle_t 
fd, struct iovec *iov, + int iovcnt, uint64_t offset, + size_t expected_written); +MDBX_INTERNAL_FUNC int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t count, + uint64_t offset); +MDBX_INTERNAL_FUNC int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf, + size_t count, uint64_t offset); +MDBX_INTERNAL_FUNC int mdbx_write(mdbx_filehandle_t fd, const void *buf, + size_t count); + +MDBX_INTERNAL_FUNC int +mdbx_thread_create(mdbx_thread_t *thread, + THREAD_RESULT(THREAD_CALL *start_routine)(void *), + void *arg); +MDBX_INTERNAL_FUNC int mdbx_thread_join(mdbx_thread_t thread); + +enum mdbx_syncmode_bits { + MDBX_SYNC_NONE = 0, + MDBX_SYNC_DATA = 1, + MDBX_SYNC_SIZE = 2, + MDBX_SYNC_IODQ = 4 +}; + +MDBX_INTERNAL_FUNC int mdbx_fsync(mdbx_filehandle_t fd, + const enum mdbx_syncmode_bits mode_bits); +MDBX_INTERNAL_FUNC int mdbx_ftruncate(mdbx_filehandle_t fd, uint64_t length); +MDBX_INTERNAL_FUNC int mdbx_fseek(mdbx_filehandle_t fd, uint64_t pos); +MDBX_INTERNAL_FUNC int mdbx_filesize(mdbx_filehandle_t fd, uint64_t *length); + +enum mdbx_openfile_purpose { + MDBX_OPEN_DXB_READ = 0, + MDBX_OPEN_DXB_LAZY = 1, + MDBX_OPEN_DXB_DSYNC = 2, + MDBX_OPEN_LCK = 3, + MDBX_OPEN_COPY = 4, + MDBX_OPEN_DELETE = 5 +}; + +MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, + const MDBX_env *env, const char *pathname, + mdbx_filehandle_t *fd, + mdbx_mode_t unix_mode_bits); +MDBX_INTERNAL_FUNC int mdbx_closefile(mdbx_filehandle_t fd); +MDBX_INTERNAL_FUNC int mdbx_removefile(const char *pathname); +MDBX_INTERNAL_FUNC int mdbx_removedirectory(const char *pathname); +MDBX_INTERNAL_FUNC int mdbx_is_pipe(mdbx_filehandle_t fd); +MDBX_INTERNAL_FUNC int mdbx_lockfile(mdbx_filehandle_t fd, bool wait); + +#define MMAP_OPTION_TRUNCATE 1 +#define MMAP_OPTION_SEMAPHORE 2 +MDBX_INTERNAL_FUNC int mdbx_mmap(const int flags, mdbx_mmap_t *map, + const size_t must, const size_t limit, + const unsigned options); +MDBX_INTERNAL_FUNC int mdbx_munmap(mdbx_mmap_t *map); +#define MDBX_MRESIZE_MAY_MOVE 0x00000100 +#define MDBX_MRESIZE_MAY_UNMAP 0x00000200 +MDBX_INTERNAL_FUNC int mdbx_mresize(const int flags, mdbx_mmap_t *map, + size_t size, size_t limit); +#if defined(_WIN32) || defined(_WIN64) +typedef struct { + unsigned limit, count; + HANDLE handles[31]; +} mdbx_handle_array_t; +MDBX_INTERNAL_FUNC int +mdbx_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array); +MDBX_INTERNAL_FUNC int +mdbx_resume_threads_after_remap(mdbx_handle_array_t *array); +#endif /* Windows */ +MDBX_INTERNAL_FUNC int mdbx_msync(mdbx_mmap_t *map, size_t offset, + size_t length, + enum mdbx_syncmode_bits mode_bits); +MDBX_INTERNAL_FUNC int mdbx_check_fs_rdonly(mdbx_filehandle_t handle, + const char *pathname, int err); + +MDBX_MAYBE_UNUSED static __inline uint32_t mdbx_getpid(void) { + STATIC_ASSERT(sizeof(mdbx_pid_t) <= sizeof(uint32_t)); +#if defined(_WIN32) || defined(_WIN64) + return GetCurrentProcessId(); +#else + return getpid(); +#endif +} + +MDBX_MAYBE_UNUSED static __inline uintptr_t mdbx_thread_self(void) { + mdbx_tid_t thunk; + STATIC_ASSERT(sizeof(uintptr_t) >= sizeof(thunk)); +#if defined(_WIN32) || defined(_WIN64) + thunk = GetCurrentThreadId(); +#else + thunk = pthread_self(); +#endif + return (uintptr_t)thunk; +} + +MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC void mdbx_osal_jitter(bool tiny); +MDBX_INTERNAL_FUNC uint64_t mdbx_osal_monotime(void); +MDBX_INTERNAL_FUNC uint64_t +mdbx_osal_16dot16_to_monotime(uint32_t seconds_16dot16); +MDBX_INTERNAL_FUNC uint32_t mdbx_osal_monotime_to_16dot16(uint64_t 
monotime);
+
+MDBX_INTERNAL_FUNC bin128_t mdbx_osal_bootid(void);
+/*----------------------------------------------------------------------------*/
+/* lck stuff */
+
+/// \brief Initialization of synchronization primitives linked with MDBX_env
+///   instance both in LCK-file and within the current process.
+/// \param
+///   global_uniqueness_flag = true - denotes that there are no other processes
+///     working with DB and LCK-file. Thus the function MUST initialize
+///     shared synchronization objects in memory-mapped LCK-file.
+///   global_uniqueness_flag = false - denotes that at least one process is
+///     already working with DB and LCK-file, including the case when DB
+///     has already been opened in the current process. Thus the function
+///     MUST NOT initialize shared synchronization objects in memory-mapped
+///     LCK-file that are already in use.
+/// \return Error code or zero on success.
+MDBX_INTERNAL_FUNC int mdbx_lck_init(MDBX_env *env,
+                                     MDBX_env *inprocess_neighbor,
+                                     int global_uniqueness_flag);
+
+/// \brief Disconnects from shared interprocess objects and destructs
+///   synchronization objects linked with MDBX_env instance
+///   within the current process.
+/// \param
+///   inprocess_neighbor = NULL - if the current process does not have other
+///     instances of MDBX_env linked with the DB being closed.
+///     Thus the function MUST check for other processes working with DB or
+///     LCK-file, and keep or destroy shared synchronization objects in
+///     memory-mapped LCK-file depending on the result.
+///   inprocess_neighbor = not-NULL - pointer to another instance of MDBX_env
+///     (any one of them, if there are several) working with DB or LCK-file
+///     within the current process. Thus the function MUST NOT try to acquire
+///     an exclusive lock and/or try to destruct shared synchronization objects
+///     linked with DB or LCK-file. Moreover, the implementation MUST ensure
+///     correct work of other instances of MDBX_env within the current process,
+///     e.g. restore POSIX-fcntl locks after the closing of file descriptors.
+/// \return Error code (MDBX_PANIC) or zero on success.
+MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env,
+                                        MDBX_env *inprocess_neighbor);
+
+/// \brief Connects to shared interprocess locking objects and tries to acquire
+///   the maximum lock level (shared if exclusive is not available).
+///   Depending on the implementation and/or platform (Windows) this function
+///   may acquire the non-OS super-level lock (e.g. for shared synchronization
+///   objects initialization), which will be downgraded to OS-exclusive or
+///   shared via explicit calling of mdbx_lck_downgrade().
+/// \return
+///   MDBX_RESULT_TRUE (-1) - if an exclusive lock was acquired and thus
+///     the current process is the first and only after the last use of DB.
+///   MDBX_RESULT_FALSE (0) - if a shared lock was acquired and thus
+///     DB has already been opened and now is used by other processes.
+///   Otherwise (not 0 and not -1) - error code.
+MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env);
+
+/// \brief Downgrades the level of the initially acquired lock to the
+///   operational level specified by the argument. The reason for such a
+///   downgrade:
+///    - unblocking of other processes that are waiting for access, i.e.
+///      if (env->me_flags & MDBX_EXCLUSIVE) != 0, then other processes
+///      should be made aware that access is unavailable rather than
+///      wait for it.
+///    - freeing locks that interfere with file operations (especially on
+///      Windows).
+///   (env->me_flags & MDBX_EXCLUSIVE) == 0 - downgrade to shared lock.
+///   (env->me_flags & MDBX_EXCLUSIVE) != 0 - downgrade to exclusive
+///   operational lock.
+/// \return Error code or zero on success
+MDBX_INTERNAL_FUNC int mdbx_lck_downgrade(MDBX_env *env);
+
+/// \brief Locks LCK-file and/or the table of readers for (de)registering.
+/// \return Error code or zero on success
+MDBX_INTERNAL_FUNC int mdbx_rdt_lock(MDBX_env *env);
+
+/// \brief Unlocks LCK-file and/or the table of readers after (de)registering.
+MDBX_INTERNAL_FUNC void mdbx_rdt_unlock(MDBX_env *env);
+
+/// \brief Acquires the lock for DB change (on writing transaction start).
+///   Reading transactions will not be blocked.
+///   Declared as LIBMDBX_API because it is used in mdbx_chk.
+/// \return Error code or zero on success
+LIBMDBX_API int mdbx_txn_lock(MDBX_env *env, bool dont_wait);
+
+/// \brief Releases the lock once DB changes are made (after a writing
+///   transaction has finished).
+///   Declared as LIBMDBX_API because it is used in mdbx_chk.
+LIBMDBX_API void mdbx_txn_unlock(MDBX_env *env);
+
+/// \brief Sets the alive-flag of reader presence (indicative lock) for the PID
+///   of the current process. The function does no more than is needed for
+///   the correct working of mdbx_rpid_check() in other processes.
+/// \return Error code or zero on success
+MDBX_INTERNAL_FUNC int mdbx_rpid_set(MDBX_env *env);
+
+/// \brief Resets the alive-flag of reader presence (indicative lock)
+///   for the PID of the current process. The function does no more than is
+///   needed for the correct working of mdbx_rpid_check() in other processes.
+/// \return Error code or zero on success
+MDBX_INTERNAL_FUNC int mdbx_rpid_clear(MDBX_env *env);
+
+/// \brief Checks the status of a reader process with the given pid, with the
+///   help of the alive-flag of presence (indicative lock) or by other means.
+/// \return
+///   MDBX_RESULT_TRUE (-1) - if the reader process with the given PID is alive
+///     and working with DB (indicative lock is present).
+///   MDBX_RESULT_FALSE (0) - if the reader process with the given PID is absent
+///     or not working with DB (indicative lock is not present).
+///   Otherwise (not 0 and not -1) - error code.
+MDBX_INTERNAL_FUNC int mdbx_rpid_check(MDBX_env *env, uint32_t pid); + +#if defined(_WIN32) || defined(_WIN64) + +typedef void(WINAPI *MDBX_srwlock_function)(MDBX_srwlock *); +MDBX_INTERNAL_VAR MDBX_srwlock_function mdbx_srwlock_Init, + mdbx_srwlock_AcquireShared, mdbx_srwlock_ReleaseShared, + mdbx_srwlock_AcquireExclusive, mdbx_srwlock_ReleaseExclusive; + +#if _WIN32_WINNT < 0x0600 /* prior to Windows Vista */ +typedef enum _FILE_INFO_BY_HANDLE_CLASS { + FileBasicInfo, + FileStandardInfo, + FileNameInfo, + FileRenameInfo, + FileDispositionInfo, + FileAllocationInfo, + FileEndOfFileInfo, + FileStreamInfo, + FileCompressionInfo, + FileAttributeTagInfo, + FileIdBothDirectoryInfo, + FileIdBothDirectoryRestartInfo, + FileIoPriorityHintInfo, + FileRemoteProtocolInfo, + MaximumFileInfoByHandleClass +} FILE_INFO_BY_HANDLE_CLASS, + *PFILE_INFO_BY_HANDLE_CLASS; + +typedef struct _FILE_END_OF_FILE_INFO { + LARGE_INTEGER EndOfFile; +} FILE_END_OF_FILE_INFO, *PFILE_END_OF_FILE_INFO; + +#define REMOTE_PROTOCOL_INFO_FLAG_LOOPBACK 0x00000001 +#define REMOTE_PROTOCOL_INFO_FLAG_OFFLINE 0x00000002 + +typedef struct _FILE_REMOTE_PROTOCOL_INFO { + USHORT StructureVersion; + USHORT StructureSize; + DWORD Protocol; + USHORT ProtocolMajorVersion; + USHORT ProtocolMinorVersion; + USHORT ProtocolRevision; + USHORT Reserved; + DWORD Flags; + struct { + DWORD Reserved[8]; + } GenericReserved; + struct { + DWORD Reserved[16]; + } ProtocolSpecificReserved; +} FILE_REMOTE_PROTOCOL_INFO, *PFILE_REMOTE_PROTOCOL_INFO; + +#endif /* _WIN32_WINNT < 0x0600 (prior to Windows Vista) */ + +typedef BOOL(WINAPI *MDBX_GetFileInformationByHandleEx)( + _In_ HANDLE hFile, _In_ FILE_INFO_BY_HANDLE_CLASS FileInformationClass, + _Out_ LPVOID lpFileInformation, _In_ DWORD dwBufferSize); +MDBX_INTERNAL_VAR MDBX_GetFileInformationByHandleEx + mdbx_GetFileInformationByHandleEx; + +typedef BOOL(WINAPI *MDBX_GetVolumeInformationByHandleW)( + _In_ HANDLE hFile, _Out_opt_ LPWSTR lpVolumeNameBuffer, + _In_ DWORD nVolumeNameSize, _Out_opt_ LPDWORD lpVolumeSerialNumber, + _Out_opt_ LPDWORD lpMaximumComponentLength, + _Out_opt_ LPDWORD lpFileSystemFlags, + _Out_opt_ LPWSTR lpFileSystemNameBuffer, _In_ DWORD nFileSystemNameSize); +MDBX_INTERNAL_VAR MDBX_GetVolumeInformationByHandleW + mdbx_GetVolumeInformationByHandleW; + +typedef DWORD(WINAPI *MDBX_GetFinalPathNameByHandleW)(_In_ HANDLE hFile, + _Out_ LPWSTR lpszFilePath, + _In_ DWORD cchFilePath, + _In_ DWORD dwFlags); +MDBX_INTERNAL_VAR MDBX_GetFinalPathNameByHandleW mdbx_GetFinalPathNameByHandleW; + +typedef BOOL(WINAPI *MDBX_SetFileInformationByHandle)( + _In_ HANDLE hFile, _In_ FILE_INFO_BY_HANDLE_CLASS FileInformationClass, + _Out_ LPVOID lpFileInformation, _In_ DWORD dwBufferSize); +MDBX_INTERNAL_VAR MDBX_SetFileInformationByHandle + mdbx_SetFileInformationByHandle; + +typedef NTSTATUS(NTAPI *MDBX_NtFsControlFile)( + IN HANDLE FileHandle, IN OUT HANDLE Event, + IN OUT PVOID /* PIO_APC_ROUTINE */ ApcRoutine, IN OUT PVOID ApcContext, + OUT PIO_STATUS_BLOCK IoStatusBlock, IN ULONG FsControlCode, + IN OUT PVOID InputBuffer, IN ULONG InputBufferLength, + OUT OPTIONAL PVOID OutputBuffer, IN ULONG OutputBufferLength); +MDBX_INTERNAL_VAR MDBX_NtFsControlFile mdbx_NtFsControlFile; + +typedef uint64_t(WINAPI *MDBX_GetTickCount64)(void); +MDBX_INTERNAL_VAR MDBX_GetTickCount64 mdbx_GetTickCount64; + +#if !defined(_WIN32_WINNT_WIN8) || _WIN32_WINNT < _WIN32_WINNT_WIN8 +typedef struct _WIN32_MEMORY_RANGE_ENTRY { + PVOID VirtualAddress; + SIZE_T NumberOfBytes; +} WIN32_MEMORY_RANGE_ENTRY, 
*PWIN32_MEMORY_RANGE_ENTRY;
+#endif /* Windows 8.x */
+
+typedef BOOL(WINAPI *MDBX_PrefetchVirtualMemory)(
+    HANDLE hProcess, ULONG_PTR NumberOfEntries,
+    PWIN32_MEMORY_RANGE_ENTRY VirtualAddresses, ULONG Flags);
+MDBX_INTERNAL_VAR MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory;
+
+typedef enum _SECTION_INHERIT { ViewShare = 1, ViewUnmap = 2 } SECTION_INHERIT;
+
+typedef NTSTATUS(NTAPI *MDBX_NtExtendSection)(IN HANDLE SectionHandle,
+                                              IN PLARGE_INTEGER NewSectionSize);
+MDBX_INTERNAL_VAR MDBX_NtExtendSection mdbx_NtExtendSection;
+
+static __inline bool mdbx_RunningUnderWine(void) {
+  return !mdbx_NtExtendSection;
+}
+
+typedef LSTATUS(WINAPI *MDBX_RegGetValueA)(HKEY hkey, LPCSTR lpSubKey,
+                                           LPCSTR lpValue, DWORD dwFlags,
+                                           LPDWORD pdwType, PVOID pvData,
+                                           LPDWORD pcbData);
+MDBX_INTERNAL_VAR MDBX_RegGetValueA mdbx_RegGetValueA;
+
+#endif /* Windows */
+
+#endif /* !__cplusplus */
+
+/*----------------------------------------------------------------------------*/
+
+#if defined(_MSC_VER) && _MSC_VER >= 1900
+/* LY: MSVC 2015/2017/2019 has buggy/inconsistent PRIuPTR/PRIxPTR macros
+ * for the internal format-args checker. */
+#undef PRIuPTR
+#undef PRIiPTR
+#undef PRIdPTR
+#undef PRIxPTR
+#define PRIuPTR "Iu"
+#define PRIiPTR "Ii"
+#define PRIdPTR "Id"
+#define PRIxPTR "Ix"
+#define PRIuSIZE "zu"
+#define PRIiSIZE "zi"
+#define PRIdSIZE "zd"
+#define PRIxSIZE "zx"
+#endif /* fix PRI*PTR for _MSC_VER */
+
+#ifndef PRIuSIZE
+#define PRIuSIZE PRIuPTR
+#define PRIiSIZE PRIiPTR
+#define PRIdSIZE PRIdPTR
+#define PRIxSIZE PRIxPTR
+#endif /* PRI*SIZE macros for MSVC */
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+#define mdbx_sourcery_anchor XCONCAT(mdbx_sourcery_, MDBX_BUILD_SOURCERY)
+#if defined(xMDBX_TOOLS)
+extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
+#endif
+
+/*******************************************************************************
+ *******************************************************************************
+ *******************************************************************************
+ *
+ *
+ *         ####   #####    #####     #     ####   #    #   ####
+ *        #    #  #    #     #       #    #    #  ##   #  #
+ *        #    #  #    #     #       #    #    #  # #  #   ####
+ *        #    #  #####      #       #    #    #  #  # #       #
+ *        #    #  #          #       #    #    #  #   ##  #    #
+ *         ####   #          #       #     ####   #    #   ####
+ *
+ *
+ */
+
+/** \defgroup build_option Build options
+ * The libmdbx build options.
+ @{ */
+
+/** Use fcntl(F_FULLFSYNC), with a 5-10x slowdown */
+#define MDBX_OSX_WANNA_DURABILITY 0
+/** Use fsync(), with a chance of data loss on power failure */
+#define MDBX_OSX_WANNA_SPEED 1
+
+#ifndef MDBX_OSX_SPEED_INSTEADOF_DURABILITY
+/** Chooses between \ref MDBX_OSX_WANNA_DURABILITY and
+ * \ref MDBX_OSX_WANNA_SPEED for OSX & iOS */
+#define MDBX_OSX_SPEED_INSTEADOF_DURABILITY MDBX_OSX_WANNA_DURABILITY
+#endif /* MDBX_OSX_SPEED_INSTEADOF_DURABILITY */
+
+/** Controls checking the PID to detect reuse of a DB environment
+ * after fork() */
+#ifndef MDBX_ENV_CHECKPID
+#if defined(MADV_DONTFORK) || defined(_WIN32) || defined(_WIN64)
+/* The PID check could be omitted:
+ *  - on Linux when madvise(MADV_DONTFORK) is available, i.e. after the fork()
+ *    mapped pages will not be available for the child process.
+ *  - on Windows, where fork() is not available. */
+#define MDBX_ENV_CHECKPID 0
+#else
+#define MDBX_ENV_CHECKPID 1
+#endif
+#define MDBX_ENV_CHECKPID_CONFIG "AUTO=" STRINGIFY(MDBX_ENV_CHECKPID)
+#else
+#define MDBX_ENV_CHECKPID_CONFIG STRINGIFY(MDBX_ENV_CHECKPID)
+#endif /* MDBX_ENV_CHECKPID */
+
+/** Controls checking the transaction owner thread to guard against misuse
+ * of transactions from other threads.
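+ *
+ * An illustrative (not normative) sketch of the misuse this option catches,
+ * with a hypothetical env/dbi/key/val already set up:
+ *
+ *   MDBX_txn *txn;
+ *   mdbx_txn_begin(env, NULL, 0, &txn);  // write txn begun on thread A
+ *   // ... later, called from thread B:
+ *   mdbx_put(txn, dbi, &key, &val, 0);   // fails with MDBX_THREAD_MISMATCH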
*/
+#ifndef MDBX_TXN_CHECKOWNER
+#define MDBX_TXN_CHECKOWNER 1
+#define MDBX_TXN_CHECKOWNER_CONFIG "AUTO=" STRINGIFY(MDBX_TXN_CHECKOWNER)
+#else
+#define MDBX_TXN_CHECKOWNER_CONFIG STRINGIFY(MDBX_TXN_CHECKOWNER)
+#endif /* MDBX_TXN_CHECKOWNER */
+
+/** Does the system have a battery-backed Real-Time Clock, or just a fake
+ * one. */
+#ifndef MDBX_TRUST_RTC
+#if defined(__linux__) || defined(__gnu_linux__) || defined(__NetBSD__) || \
+    defined(__OpenBSD__)
+#define MDBX_TRUST_RTC 0 /* a lot of embedded systems have a fake RTC */
+#else
+#define MDBX_TRUST_RTC 1
+#endif
+#define MDBX_TRUST_RTC_CONFIG "AUTO=" STRINGIFY(MDBX_TRUST_RTC)
+#else
+#define MDBX_TRUST_RTC_CONFIG STRINGIFY(MDBX_TRUST_RTC)
+#endif /* MDBX_TRUST_RTC */
+
+/** Controls online database auto-compactification during
+ * write-transactions. */
+#ifndef MDBX_ENABLE_REFUND
+#define MDBX_ENABLE_REFUND 1
+#elif !(MDBX_ENABLE_REFUND == 0 || MDBX_ENABLE_REFUND == 1)
+#error MDBX_ENABLE_REFUND must be defined as 0 or 1
+#endif /* MDBX_ENABLE_REFUND */
+
+/** Controls gathering statistics for page operations. */
+#ifndef MDBX_ENABLE_PGOP_STAT
+#define MDBX_ENABLE_PGOP_STAT 1
+#elif !(MDBX_ENABLE_PGOP_STAT == 0 || MDBX_ENABLE_PGOP_STAT == 1)
+#error MDBX_ENABLE_PGOP_STAT must be defined as 0 or 1
+#endif /* MDBX_ENABLE_PGOP_STAT */
+
+/** Controls use of POSIX madvise() hints and friends. */
+#ifndef MDBX_ENABLE_MADVISE
+#define MDBX_ENABLE_MADVISE 1
+#elif !(MDBX_ENABLE_MADVISE == 0 || MDBX_ENABLE_MADVISE == 1)
+#error MDBX_ENABLE_MADVISE must be defined as 0 or 1
+#endif /* MDBX_ENABLE_MADVISE */
+
+/** Disables some checks to reduce overhead, lowering the probability of
+ * detecting database corruption to values closer to LMDB's. */
+#ifndef MDBX_DISABLE_PAGECHECKS
+#define MDBX_DISABLE_PAGECHECKS 0
+#elif !(MDBX_DISABLE_PAGECHECKS == 0 || MDBX_DISABLE_PAGECHECKS == 1)
+#error MDBX_DISABLE_PAGECHECKS must be defined as 0 or 1
+#endif /* MDBX_DISABLE_PAGECHECKS */
+
+#ifndef MDBX_PNL_PREALLOC_FOR_RADIXSORT
+#define MDBX_PNL_PREALLOC_FOR_RADIXSORT 1
+#elif !(MDBX_PNL_PREALLOC_FOR_RADIXSORT == 0 || \
+        MDBX_PNL_PREALLOC_FOR_RADIXSORT == 1)
+#error MDBX_PNL_PREALLOC_FOR_RADIXSORT must be defined as 0 or 1
+#endif /* MDBX_PNL_PREALLOC_FOR_RADIXSORT */
+
+#ifndef MDBX_DPL_PREALLOC_FOR_RADIXSORT
+#define MDBX_DPL_PREALLOC_FOR_RADIXSORT 1
+#elif !(MDBX_DPL_PREALLOC_FOR_RADIXSORT == 0 || \
+        MDBX_DPL_PREALLOC_FOR_RADIXSORT == 1)
+#error MDBX_DPL_PREALLOC_FOR_RADIXSORT must be defined as 0 or 1
+#endif /* MDBX_DPL_PREALLOC_FOR_RADIXSORT */
+
+/* Basically, this build-option is for TODO. Guess it should be replaced
+ * with MDBX_ENABLE_WRITEMAP_SPILLING with the three variants:
+ *  0/OFF = Don't track dirty pages at all, and don't spill them.
+ *     This should be the default on Linux and maybe other systems
+ *     (not sure: Darwin/OSX, FreeBSD, Windows 10) where the kernel provides
+ *     proper LRU tracking and async writing on-demand.
+ *  1/ON = Light tracking of dirty pages, but with LRU labels and explicit
+ *     spilling with msync(MS_ASYNC). */
+#ifndef MDBX_FAKE_SPILL_WRITEMAP
+#if defined(__linux__) || defined(__gnu_linux__)
+#define MDBX_FAKE_SPILL_WRITEMAP 1 /* msync(MS_ASYNC) is no-op on Linux */
+#else
+#define MDBX_FAKE_SPILL_WRITEMAP 0
+#endif
+#elif !(MDBX_FAKE_SPILL_WRITEMAP == 0 || MDBX_FAKE_SPILL_WRITEMAP == 1)
+#error MDBX_FAKE_SPILL_WRITEMAP must be defined as 0 or 1
+#endif /* MDBX_FAKE_SPILL_WRITEMAP */
+
+/** Controls the sort order of internal page number lists.
+ * This is a mostly experimental/advanced option, not intended for regular
+ * MDBX users.
+ * \warning The database format depends on this option, and libmdbx builds
+ * with different option values are incompatible. */
+#ifndef MDBX_PNL_ASCENDING
+#define MDBX_PNL_ASCENDING 0
+#elif !(MDBX_PNL_ASCENDING == 0 || MDBX_PNL_ASCENDING == 1)
+#error MDBX_PNL_ASCENDING must be defined as 0 or 1
+#endif /* MDBX_PNL_ASCENDING */
+
+/** Avoid depending on the MSVC CRT and use ntdll.dll instead. */
+#ifndef MDBX_WITHOUT_MSVC_CRT
+#define MDBX_WITHOUT_MSVC_CRT 1
+#elif !(MDBX_WITHOUT_MSVC_CRT == 0 || MDBX_WITHOUT_MSVC_CRT == 1)
+#error MDBX_WITHOUT_MSVC_CRT must be defined as 0 or 1
+#endif /* MDBX_WITHOUT_MSVC_CRT */
+
+/** Size of the buffer used while copying an environment/database file. */
+#ifndef MDBX_ENVCOPY_WRITEBUF
+#define MDBX_ENVCOPY_WRITEBUF 1048576u
+#elif MDBX_ENVCOPY_WRITEBUF < 65536u || MDBX_ENVCOPY_WRITEBUF > 1073741824u || \
+    MDBX_ENVCOPY_WRITEBUF % 65536u
+#error MDBX_ENVCOPY_WRITEBUF must be defined in range 65536..1073741824 and be multiple of 65536
+#endif /* MDBX_ENVCOPY_WRITEBUF */
+
+/** Forces assertion checking */
+#ifndef MDBX_FORCE_ASSERTIONS
+#define MDBX_FORCE_ASSERTIONS 0
+#elif !(MDBX_FORCE_ASSERTIONS == 0 || MDBX_FORCE_ASSERTIONS == 1)
+#error MDBX_FORCE_ASSERTIONS must be defined as 0 or 1
+#endif /* MDBX_FORCE_ASSERTIONS */
+
+/** Presumed malloc size overhead for each allocation,
+ * used to adjust allocations to be better aligned. */
+#ifndef MDBX_ASSUME_MALLOC_OVERHEAD
+#ifdef __SIZEOF_POINTER__
+#define MDBX_ASSUME_MALLOC_OVERHEAD (__SIZEOF_POINTER__ * 2u)
+#else
+#define MDBX_ASSUME_MALLOC_OVERHEAD (sizeof(void *) * 2u)
+#endif
+#elif MDBX_ASSUME_MALLOC_OVERHEAD < 0 || MDBX_ASSUME_MALLOC_OVERHEAD > 64 || \
+    MDBX_ASSUME_MALLOC_OVERHEAD % 4
+#error MDBX_ASSUME_MALLOC_OVERHEAD must be defined in range 0..64 and be multiple of 4
+#endif /* MDBX_ASSUME_MALLOC_OVERHEAD */
+
+/** If MDBX_DEBUG is undefined, set it according to NDEBUG */
+#ifndef MDBX_DEBUG
+#ifdef NDEBUG
+#define MDBX_DEBUG 0
+#else
+#define MDBX_DEBUG 1
+#endif
+#endif /* MDBX_DEBUG */
+
+/** If defined then enables integration with Valgrind,
+ * a memory analyzing tool. */
+#ifndef MDBX_USE_VALGRIND
+#endif /* MDBX_USE_VALGRIND */
+
+/** If defined then forces the use of C11 atomics,
+ * otherwise their availability is detected automatically. */
+#ifndef MDBX_HAVE_C11ATOMICS
+#endif /* MDBX_HAVE_C11ATOMICS */
+
+//------------------------------------------------------------------------------
+
+/** Win32 File Locking API for \ref MDBX_LOCKING */
+#define MDBX_LOCKING_WIN32FILES -1
+
+/** SystemV IPC semaphores for \ref MDBX_LOCKING */
+#define MDBX_LOCKING_SYSV 5
+
+/** POSIX-1 Shared anonymous semaphores for \ref MDBX_LOCKING */
+#define MDBX_LOCKING_POSIX1988 1988
+
+/** POSIX-2001 Shared Mutexes for \ref MDBX_LOCKING */
+#define MDBX_LOCKING_POSIX2001 2001
+
+/** POSIX-2008 Robust Mutexes for \ref MDBX_LOCKING */
+#define MDBX_LOCKING_POSIX2008 2008
+
+/** BeOS Benaphores, aka Futexes for \ref MDBX_LOCKING */
+#define MDBX_LOCKING_BENAPHORE 1995
+
+/** Advanced: Chooses the locking implementation (autodetection by default). */
+#if defined(_WIN32) || defined(_WIN64)
+#define MDBX_LOCKING MDBX_LOCKING_WIN32FILES
+#else
+#ifndef MDBX_LOCKING
+#if defined(_POSIX_THREAD_PROCESS_SHARED) && \
+    _POSIX_THREAD_PROCESS_SHARED >= 200112L && !defined(__FreeBSD__)
+
+/* Some platforms define the EOWNERDEAD error code even though they
+ * don't support Robust Mutexes.
+ * If in doubt, compile with -DMDBX_LOCKING=2001.
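+ *
+ * For instance (an illustrative build invocation, not a normative one),
+ * the implementation can be forced via the values defined above:
+ *
+ *   cc -c -DMDBX_LOCKING=2001 mdbx.c   // force POSIX-2001 mutexes
+ *   cc -c -DMDBX_LOCKING=5    mdbx.c   // force SysV IPC semaphores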
*/ +#if defined(EOWNERDEAD) && _POSIX_THREAD_PROCESS_SHARED >= 200809L && \ + ((defined(_POSIX_THREAD_ROBUST_PRIO_INHERIT) && \ + _POSIX_THREAD_ROBUST_PRIO_INHERIT > 0) || \ + (defined(_POSIX_THREAD_ROBUST_PRIO_PROTECT) && \ + _POSIX_THREAD_ROBUST_PRIO_PROTECT > 0) || \ + defined(PTHREAD_MUTEX_ROBUST) || defined(PTHREAD_MUTEX_ROBUST_NP)) && \ + (!defined(__GLIBC__) || \ + __GLIBC_PREREQ(2, 10) /* troubles with Robust mutexes before 2.10 */) +#define MDBX_LOCKING MDBX_LOCKING_POSIX2008 +#else +#define MDBX_LOCKING MDBX_LOCKING_POSIX2001 +#endif +#elif defined(__sun) || defined(__SVR4) || defined(__svr4__) +#define MDBX_LOCKING MDBX_LOCKING_POSIX1988 +#else +#define MDBX_LOCKING MDBX_LOCKING_SYSV +#endif +#define MDBX_LOCKING_CONFIG "AUTO=" STRINGIFY(MDBX_LOCKING) +#else +#define MDBX_LOCKING_CONFIG STRINGIFY(MDBX_LOCKING) +#endif /* MDBX_LOCKING */ +#endif /* !Windows */ + +/** Advanced: Using POSIX OFD-locks (autodetection by default). */ +#ifndef MDBX_USE_OFDLOCKS +#if defined(F_OFD_SETLK) && defined(F_OFD_SETLKW) && defined(F_OFD_GETLK) && \ + !defined(MDBX_SAFE4QEMU) && \ + !defined(__sun) /* OFD-lock are broken on Solaris */ +#define MDBX_USE_OFDLOCKS 1 +#else +#define MDBX_USE_OFDLOCKS 0 +#endif +#define MDBX_USE_OFDLOCKS_CONFIG "AUTO=" STRINGIFY(MDBX_USE_OFDLOCKS) +#else +#define MDBX_USE_OFDLOCKS_CONFIG STRINGIFY(MDBX_USE_OFDLOCKS) +#endif /* MDBX_USE_OFDLOCKS */ + +/** Advanced: Using sendfile() syscall (autodetection by default). */ +#ifndef MDBX_USE_SENDFILE +#if ((defined(__linux__) || defined(__gnu_linux__)) && \ + !defined(__ANDROID_API__)) || \ + (defined(__ANDROID_API__) && __ANDROID_API__ >= 21) +#define MDBX_USE_SENDFILE 1 +#else +#define MDBX_USE_SENDFILE 0 +#endif +#endif /* MDBX_USE_SENDFILE */ + +/** Advanced: Using copy_file_range() syscall (autodetection by default). */ +#ifndef MDBX_USE_COPYFILERANGE +#if __GLIBC_PREREQ(2, 27) && defined(_GNU_SOURCE) +#define MDBX_USE_COPYFILERANGE 1 +#else +#define MDBX_USE_COPYFILERANGE 0 +#endif +#endif /* MDBX_USE_COPYFILERANGE */ + +/** Advanced: Using sync_file_range() syscall (autodetection by default). */ +#ifndef MDBX_USE_SYNCFILERANGE +#if ((defined(__linux__) || defined(__gnu_linux__)) && \ + defined(SYNC_FILE_RANGE_WRITE) && !defined(__ANDROID_API__)) || \ + (defined(__ANDROID_API__) && __ANDROID_API__ >= 26) +#define MDBX_USE_SYNCFILERANGE 1 +#else +#define MDBX_USE_SYNCFILERANGE 0 +#endif +#endif /* MDBX_USE_SYNCFILERANGE */ + +//------------------------------------------------------------------------------ + +#ifndef MDBX_CPU_WRITEBACK_INCOHERENT +#if defined(__ia32__) || defined(__e2k__) || defined(__hppa) || \ + defined(__hppa__) || defined(DOXYGEN) +#define MDBX_CPU_WRITEBACK_INCOHERENT 0 +#else +#define MDBX_CPU_WRITEBACK_INCOHERENT 1 +#endif +#endif /* MDBX_CPU_WRITEBACK_INCOHERENT */ + +#ifndef MDBX_MMAP_INCOHERENT_FILE_WRITE +#ifdef __OpenBSD__ +#define MDBX_MMAP_INCOHERENT_FILE_WRITE 1 +#else +#define MDBX_MMAP_INCOHERENT_FILE_WRITE 0 +#endif +#endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */ + +#ifndef MDBX_MMAP_INCOHERENT_CPU_CACHE +#if defined(__mips) || defined(__mips__) || defined(__mips64) || \ + defined(__mips64__) || defined(_M_MRX000) || defined(_MIPS_) || \ + defined(__MWERKS__) || defined(__sgi) +/* MIPS has cache coherency issues. */ +#define MDBX_MMAP_INCOHERENT_CPU_CACHE 1 +#else +/* LY: assume no relevant mmap/dcache issues. 
*/
+#define MDBX_MMAP_INCOHERENT_CPU_CACHE 0
+#endif
+#endif /* MDBX_MMAP_INCOHERENT_CPU_CACHE */
+
+#ifndef MDBX_64BIT_ATOMIC
+#if MDBX_WORDBITS >= 64 || defined(DOXYGEN)
+#define MDBX_64BIT_ATOMIC 1
+#else
+#define MDBX_64BIT_ATOMIC 0
+#endif
+#define MDBX_64BIT_ATOMIC_CONFIG "AUTO=" STRINGIFY(MDBX_64BIT_ATOMIC)
+#else
+#define MDBX_64BIT_ATOMIC_CONFIG STRINGIFY(MDBX_64BIT_ATOMIC)
+#endif /* MDBX_64BIT_ATOMIC */
+
+#ifndef MDBX_64BIT_CAS
+#if defined(ATOMIC_LLONG_LOCK_FREE)
+#if ATOMIC_LLONG_LOCK_FREE > 1
+#define MDBX_64BIT_CAS 1
+#else
+#define MDBX_64BIT_CAS 0
+#endif
+#elif defined(__GCC_ATOMIC_LLONG_LOCK_FREE)
+#if __GCC_ATOMIC_LLONG_LOCK_FREE > 1
+#define MDBX_64BIT_CAS 1
+#else
+#define MDBX_64BIT_CAS 0
+#endif
+#elif defined(__CLANG_ATOMIC_LLONG_LOCK_FREE)
+#if __CLANG_ATOMIC_LLONG_LOCK_FREE > 1
+#define MDBX_64BIT_CAS 1
+#else
+#define MDBX_64BIT_CAS 0
+#endif
+#elif defined(_MSC_VER) || defined(__APPLE__) || defined(DOXYGEN)
+#define MDBX_64BIT_CAS 1
+#else
+#define MDBX_64BIT_CAS MDBX_64BIT_ATOMIC
+#endif
+#define MDBX_64BIT_CAS_CONFIG "AUTO=" STRINGIFY(MDBX_64BIT_CAS)
+#else
+#define MDBX_64BIT_CAS_CONFIG STRINGIFY(MDBX_64BIT_CAS)
+#endif /* MDBX_64BIT_CAS */
+
+#ifndef MDBX_UNALIGNED_OK
+#ifdef _MSC_VER
+#define MDBX_UNALIGNED_OK 1 /* avoid MSVC misoptimization */
+#elif __CLANG_PREREQ(5, 0) || __GNUC_PREREQ(5, 0)
+#define MDBX_UNALIGNED_OK 0 /* expecting optimization is well done */
+#elif (defined(__ia32__) || defined(__ARM_FEATURE_UNALIGNED)) && \
+    !defined(__ALIGNED__)
+#define MDBX_UNALIGNED_OK 1
+#else
+#define MDBX_UNALIGNED_OK 0
+#endif
+#endif /* MDBX_UNALIGNED_OK */
+
+#ifndef MDBX_CACHELINE_SIZE
+#if defined(SYSTEM_CACHE_ALIGNMENT_SIZE)
+#define MDBX_CACHELINE_SIZE SYSTEM_CACHE_ALIGNMENT_SIZE
+#elif defined(__ia64__) || defined(__ia64) || defined(_M_IA64)
+#define MDBX_CACHELINE_SIZE 128
+#else
+#define MDBX_CACHELINE_SIZE 64
+#endif
+#endif /* MDBX_CACHELINE_SIZE */
+
+/** @} end of build options */
+/*******************************************************************************
+ *******************************************************************************
+ ******************************************************************************/
+
+#ifdef DOXYGEN
+/* !!! Actually these are fake definitions, !!!
+ * !!! only for documentation generation by Doxygen !!! */
+
+/** Controls enabling of debugging features.
+ *
+ *  - `MDBX_DEBUG = 0` (by default) Disables all debugging features,
+ *                     including logging and assertion controls.
+ *                     Changes to the logging level and corresponding debug
+ *                     flags via \ref mdbx_setup_debug() will have no effect.
+ *  - `MDBX_DEBUG > 0` Enables code for the debugging features (logging,
+ *                     assertion checking and internal audit).
+ *                     Simultaneously sets the default logging level
+ *                     to the `MDBX_DEBUG` value.
+ *                     Also enables \ref MDBX_DBG_AUDIT if `MDBX_DEBUG >= 2`.
+ *
+ * \ingroup build_option */
+#define MDBX_DEBUG 0...7
+
+/** Disables the use of GNU libc extensions.
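+ *
+ * As an illustrative example (a hypothetical build command), this and the
+ * MDBX_DEBUG option above can both be overridden on the compiler command
+ * line:
+ *
+ *   cc -c -DMDBX_DEBUG=2 -DMDBX_DISABLE_GNU_SOURCE=1 mdbx.c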
*/ +#define MDBX_DISABLE_GNU_SOURCE 0 or 1 + +#endif /* DOXYGEN */ + +/* Undefine the NDEBUG if debugging is enforced by MDBX_DEBUG */ +#if MDBX_DEBUG +#undef NDEBUG +#endif + +/*----------------------------------------------------------------------------*/ +/* Atomics */ + +enum MDBX_memory_order { + mo_Relaxed, + mo_AcquireRelease, + mo_SequentialConsistency +}; + +typedef union { + volatile uint32_t weak; +#ifdef MDBX_HAVE_C11ATOMICS + volatile _Atomic uint32_t c11a; +#endif /* MDBX_HAVE_C11ATOMICS */ +} MDBX_atomic_uint32_t; + +typedef union { + volatile uint64_t weak; +#if defined(MDBX_HAVE_C11ATOMICS) && (MDBX_64BIT_CAS || MDBX_64BIT_ATOMIC) + volatile _Atomic uint64_t c11a; +#endif +#if !defined(MDBX_HAVE_C11ATOMICS) || !MDBX_64BIT_CAS || !MDBX_64BIT_ATOMIC + __anonymous_struct_extension__ struct { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + MDBX_atomic_uint32_t low, high; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + MDBX_atomic_uint32_t high, low; +#else +#error "FIXME: Unsupported byte order" +#endif /* __BYTE_ORDER__ */ + }; +#endif +} MDBX_atomic_uint64_t; + +#ifdef MDBX_HAVE_C11ATOMICS + +/* Crutches for C11 atomic compiler's bugs */ +#if defined(__e2k__) && defined(__LCC__) && __LCC__ < /* FIXME */ 127 +#define MDBX_c11a_ro(type, ptr) (&(ptr)->weak) +#define MDBX_c11a_rw(type, ptr) (&(ptr)->weak) +#elif defined(__clang__) && __clang__ < 8 +#define MDBX_c11a_ro(type, ptr) ((volatile _Atomic(type) *)&(ptr)->c11a) +#define MDBX_c11a_rw(type, ptr) (&(ptr)->c11a) +#else +#define MDBX_c11a_ro(type, ptr) (&(ptr)->c11a) +#define MDBX_c11a_rw(type, ptr) (&(ptr)->c11a) +#endif /* Crutches for C11 atomic compiler's bugs */ + +static __always_inline memory_order mo_c11_store(enum MDBX_memory_order fence) { + switch (fence) { + default: + assert(false); + __unreachable(); + case mo_Relaxed: + return memory_order_relaxed; + case mo_AcquireRelease: + return memory_order_release; + case mo_SequentialConsistency: + return memory_order_seq_cst; + } +} + +static __always_inline memory_order mo_c11_load(enum MDBX_memory_order fence) { + switch (fence) { + default: + assert(false); + __unreachable(); + case mo_Relaxed: + return memory_order_relaxed; + case mo_AcquireRelease: + return memory_order_acquire; + case mo_SequentialConsistency: + return memory_order_seq_cst; + } +} +#endif /* MDBX_HAVE_C11ATOMICS */ + +#ifndef __cplusplus + +static __inline void mdbx_jitter4testing(bool tiny); + +MDBX_MAYBE_UNUSED static __always_inline void +mdbx_memory_fence(enum MDBX_memory_order order, bool write) { +#ifdef MDBX_HAVE_C11ATOMICS + atomic_thread_fence(write ? mo_c11_store(order) : mo_c11_load(order)); +#else /* MDBX_HAVE_C11ATOMICS */ + mdbx_compiler_barrier(); + if (write && + order > (MDBX_CPU_WRITEBACK_INCOHERENT ? 
mo_Relaxed : mo_AcquireRelease)) + mdbx_memory_barrier(); +#endif /* MDBX_HAVE_C11ATOMICS */ +} + +MDBX_MAYBE_UNUSED static __always_inline uint32_t +atomic_store32(MDBX_atomic_uint32_t *p, const uint32_t value, + enum MDBX_memory_order order) { + STATIC_ASSERT(sizeof(MDBX_atomic_uint32_t) == 4); +#ifdef MDBX_HAVE_C11ATOMICS + assert(atomic_is_lock_free(MDBX_c11a_rw(uint32_t, p))); + atomic_store_explicit(MDBX_c11a_rw(uint32_t, p), value, mo_c11_store(order)); +#else /* MDBX_HAVE_C11ATOMICS */ + if (order != mo_Relaxed) + mdbx_compiler_barrier(); + p->weak = value; + mdbx_memory_fence(order, true); +#endif /* MDBX_HAVE_C11ATOMICS */ + return value; +} + +MDBX_MAYBE_UNUSED static __always_inline uint32_t +atomic_load32(const MDBX_atomic_uint32_t *p, enum MDBX_memory_order order) { + STATIC_ASSERT(sizeof(MDBX_atomic_uint32_t) == 4); +#ifdef MDBX_HAVE_C11ATOMICS + assert(atomic_is_lock_free(MDBX_c11a_ro(uint32_t, p))); + return atomic_load_explicit(MDBX_c11a_ro(uint32_t, p), mo_c11_load(order)); +#else /* MDBX_HAVE_C11ATOMICS */ + mdbx_memory_fence(order, false); + const uint32_t value = p->weak; + if (order != mo_Relaxed) + mdbx_compiler_barrier(); + return value; +#endif /* MDBX_HAVE_C11ATOMICS */ +} + +MDBX_MAYBE_UNUSED static __always_inline uint64_t +atomic_store64(MDBX_atomic_uint64_t *p, const uint64_t value, + enum MDBX_memory_order order) { + STATIC_ASSERT(sizeof(MDBX_atomic_uint64_t) == 8); +#if MDBX_64BIT_ATOMIC +#ifdef MDBX_HAVE_C11ATOMICS + assert(atomic_is_lock_free(MDBX_c11a_rw(uint64_t, p))); + atomic_store_explicit(MDBX_c11a_rw(uint64_t, p), value, mo_c11_store(order)); +#else /* MDBX_HAVE_C11ATOMICS */ + if (order != mo_Relaxed) + mdbx_compiler_barrier(); + p->weak = value; + mdbx_memory_fence(order, true); +#endif /* MDBX_HAVE_C11ATOMICS */ +#else /* !MDBX_64BIT_ATOMIC */ + mdbx_compiler_barrier(); + atomic_store32(&p->low, (uint32_t)value, mo_Relaxed); + mdbx_jitter4testing(true); + atomic_store32(&p->high, (uint32_t)(value >> 32), order); + mdbx_jitter4testing(true); +#endif /* !MDBX_64BIT_ATOMIC */ + return value; +} + +MDBX_MAYBE_UNUSED static +#if MDBX_64BIT_ATOMIC + __always_inline +#endif /* MDBX_64BIT_ATOMIC */ + uint64_t + atomic_load64(const MDBX_atomic_uint64_t *p, + enum MDBX_memory_order order) { + STATIC_ASSERT(sizeof(MDBX_atomic_uint64_t) == 8); +#if MDBX_64BIT_ATOMIC +#ifdef MDBX_HAVE_C11ATOMICS + assert(atomic_is_lock_free(MDBX_c11a_ro(uint64_t, p))); + return atomic_load_explicit(MDBX_c11a_ro(uint64_t, p), mo_c11_load(order)); +#else /* MDBX_HAVE_C11ATOMICS */ + mdbx_memory_fence(order, false); + const uint64_t value = p->weak; + if (order != mo_Relaxed) + mdbx_compiler_barrier(); + return value; +#endif /* MDBX_HAVE_C11ATOMICS */ +#else /* !MDBX_64BIT_ATOMIC */ + mdbx_compiler_barrier(); + uint64_t value = (uint64_t)atomic_load32(&p->high, order) << 32; + mdbx_jitter4testing(true); + value |= atomic_load32(&p->low, (order == mo_Relaxed) ? mo_Relaxed + : mo_AcquireRelease); + mdbx_jitter4testing(true); + for (;;) { + mdbx_compiler_barrier(); + uint64_t again = (uint64_t)atomic_load32(&p->high, order) << 32; + mdbx_jitter4testing(true); + again |= atomic_load32(&p->low, (order == mo_Relaxed) ? mo_Relaxed + : mo_AcquireRelease); + mdbx_jitter4testing(true); + if (likely(value == again)) + return value; + value = again; + } +#endif /* !MDBX_64BIT_ATOMIC */ +} + +#endif /* !__cplusplus */ + +/*----------------------------------------------------------------------------*/ +/* Basic constants and types */ + +/* A stamp that identifies a file as an MDBX file. 
+ * There's nothing special about this value other than that it is easily
+ * recognizable, and it will reflect any byte order mismatches. */
+#define MDBX_MAGIC UINT64_C(/* 56-bit prime */ 0x59659DBDEF4C11)
+
+/* FROZEN: The version number for a database's datafile format. */
+#define MDBX_DATA_VERSION 2
+/* The version number for a database's lockfile format. */
+#define MDBX_LOCK_VERSION 4
+
+/* Handle for the DB used to track free pages. */
+#define FREE_DBI 0
+/* Handle for the default DB. */
+#define MAIN_DBI 1
+/* Number of DBs in metapage (free and main) - also hardcoded elsewhere */
+#define CORE_DBS 2
+
+/* Number of meta pages - also hardcoded elsewhere */
+#define NUM_METAS 3
+
+/* A page number in the database.
+ *
+ * MDBX uses 32-bit page numbers. This limits the database size
+ * to 2^44 bytes in the case of 4K pages. */
+typedef uint32_t pgno_t;
+typedef MDBX_atomic_uint32_t atomic_pgno_t;
+#define PRIaPGNO PRIu32
+#define MAX_PAGENO UINT32_C(0x7FFFffff)
+#define MIN_PAGENO NUM_METAS
+
+#define SAFE64_INVALID_THRESHOLD UINT64_C(0xffffFFFF00000000)
+
+/* A transaction ID. */
+typedef uint64_t txnid_t;
+typedef MDBX_atomic_uint64_t atomic_txnid_t;
+#define PRIaTXN PRIi64
+#define MIN_TXNID UINT64_C(1)
+#define MAX_TXNID (SAFE64_INVALID_THRESHOLD - 1)
+#define INITIAL_TXNID (MIN_TXNID + NUM_METAS - 1)
+#define INVALID_TXNID UINT64_MAX
+/* LY: for testing non-atomic 64-bit txnid on 32-bit arches.
+ * #define xMDBX_TXNID_STEP (UINT32_MAX / 3) */
+#ifndef xMDBX_TXNID_STEP
+#if MDBX_64BIT_CAS
+#define xMDBX_TXNID_STEP 1u
+#else
+#define xMDBX_TXNID_STEP 2u
+#endif
+#endif /* xMDBX_TXNID_STEP */
+
+/* Used for offsets within a single page.
+ * Since memory pages are typically 4 or 8KB in size, 12-13 bits
+ * are enough, so this is plenty. */
+typedef uint16_t indx_t;
+
+#define MEGABYTE ((size_t)1 << 20)
+
+/*----------------------------------------------------------------------------*/
+/* Core structures for database and shared memory (i.e. format definition) */
+#pragma pack(push, 1)
+
+/* Information about a single database in the environment. */
+typedef struct MDBX_db {
+  uint16_t md_flags;        /* see mdbx_dbi_open */
+  uint16_t md_depth;        /* depth of this tree */
+  uint32_t md_xsize;        /* key-size for MDBX_DUPFIXED (LEAF2 pages) */
+  pgno_t md_root;           /* the root page of this tree */
+  pgno_t md_branch_pages;   /* number of internal pages */
+  pgno_t md_leaf_pages;     /* number of leaf pages */
+  pgno_t md_overflow_pages; /* number of overflow pages */
+  uint64_t md_seq;          /* table sequence counter */
+  uint64_t md_entries;      /* number of data items */
+  uint64_t md_mod_txnid;    /* txnid of last committed modification */
+} MDBX_db;
+
+/* database size-related parameters */
+typedef struct MDBX_geo {
+  uint16_t grow_pv;   /* datafile growth step as a 16-bit packed (exponential
+                         quantized) value */
+  uint16_t shrink_pv; /* datafile shrink threshold as a 16-bit packed
+                         (exponential quantized) value */
+  pgno_t lower;       /* minimal size of datafile in pages */
+  pgno_t upper;       /* maximal size of datafile in pages */
+  pgno_t now;         /* current size of datafile in pages */
+  pgno_t next;        /* first unused page in the datafile,
+                         but actually the file may be shorter. */
+} MDBX_geo;
+
+/* Meta page content.
+ * A meta page is the start point for accessing a database snapshot.
+ * Pages 0..NUM_METAS-1 are meta pages; a committing transaction
+ * overwrites one of them. */
+typedef struct MDBX_meta {
+  /* Stamp identifying this as an MDBX file.
+   * It must be set to MDBX_MAGIC with MDBX_DATA_VERSION.
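+   *
+   * An illustrative (not normative) sketch of the validation this implies,
+   * using the unaligned_peek_u64() helper and the MDBX_DATA_MAGIC value
+   * defined further below:
+   *
+   *   if (unaligned_peek_u64(4, meta->mm_magic_and_version) != MDBX_DATA_MAGIC)
+   *     return MDBX_VERSION_MISMATCH;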
*/
+  uint32_t mm_magic_and_version[2];
+
+  /* txnid that committed this page, the first of a two-phase-update pair */
+  uint32_t mm_txnid_a[2];
+
+  uint16_t mm_extra_flags;  /* extra DB flags, zero (nothing) for now */
+  uint8_t mm_validator_id;  /* ID of checksum and page validation method,
+                             * zero (nothing) for now */
+  uint8_t mm_extra_pagehdr; /* extra bytes in the page header,
+                             * zero (nothing) for now */
+
+  MDBX_geo mm_geo; /* database size-related parameters */
+
+  MDBX_db mm_dbs[CORE_DBS]; /* first is free space, 2nd is main db */
+                            /* The size of pages used in this DB */
+#define mm_psize mm_dbs[FREE_DBI].md_xsize
+/* Any persistent environment flags, see mdbx_env */
+#define mm_flags mm_dbs[FREE_DBI].md_flags
+  MDBX_canary mm_canary;
+
+#define MDBX_DATASIGN_NONE 0u
+#define MDBX_DATASIGN_WEAK 1u
+#define SIGN_IS_STEADY(sign) ((sign) > MDBX_DATASIGN_WEAK)
+#define META_IS_STEADY(meta) \
+  SIGN_IS_STEADY(unaligned_peek_u64(4, (meta)->mm_datasync_sign))
+  uint32_t mm_datasync_sign[2];
+
+  /* txnid that committed this page, the second of a two-phase-update pair */
+  uint32_t mm_txnid_b[2];
+
+  /* Number of non-meta pages which were put into GC after COW. May be 0 in
+   * case the DB was previously handled by libmdbx without the corresponding
+   * feature. This value, coupled with mr_snapshot_pages_retired, allows a
+   * fast estimation of "how much a reader restrains GC recycling". */
+  uint32_t mm_pages_retired[2];
+
+  /* The analogue of /proc/sys/kernel/random/boot_id or similar, used to
+   * determine whether the system was rebooted after the last use of the
+   * database files. If there was no reboot, there is no need to rollback
+   * to the last steady sync point. Zeros mean that no relevant information
+   * is available from the system. */
+  bin128_t mm_bootid;
+
+} MDBX_meta;
+
+/* Common header for all page types. The page type depends on mp_flags.
+ *
+ * P_BRANCH and P_LEAF pages have unsorted 'MDBX_node's at the end, with
+ * sorted mp_ptrs[] entries referring to them. Exception: P_LEAF2 pages
+ * omit mp_ptrs and pack sorted MDBX_DUPFIXED values after the page header.
+ *
+ * P_OVERFLOW records occupy one or more contiguous pages where only the
+ * first has a page header. They hold the real data of F_BIGDATA nodes.
+ *
+ * P_SUBP sub-pages are small leaf "pages" with duplicate data.
+ * A node with flag F_DUPDATA but not F_SUBDATA contains a sub-page.
+ * (Duplicate data can also go in sub-databases, which use normal pages.)
+ *
+ * P_META pages contain MDBX_meta, the start point of an MDBX snapshot.
+ *
+ * Each non-metapage up to MDBX_meta.mm_last_pg is reachable exactly once
+ * in the snapshot: Either used by a database or listed in a GC record.
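+ *
+ * An illustrative (not normative) sketch of dispatching on the P_* flag
+ * bits defined below:
+ *
+ *   const MDBX_page *mp = ...;
+ *   if (mp->mp_flags & P_OVERFLOW)
+ *     ;  // large data, mp->mp_pages contiguous pages
+ *   else if (mp->mp_flags & P_LEAF)
+ *     ;  // leaf page; P_LEAF2 additionally marks MDBX_DUPFIXED layout
+ *   else if (mp->mp_flags & P_BRANCH)
+ *     ;  // internal page with sorted mp_ptrs[] entries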
*/
+typedef struct MDBX_page {
+  union {
+#define IS_FROZEN(txn, p) ((p)->mp_txnid < (txn)->mt_txnid)
+#define IS_SPILLED(txn, p) ((p)->mp_txnid == (txn)->mt_txnid)
+#define IS_SHADOWED(txn, p) ((p)->mp_txnid > (txn)->mt_txnid)
+#define IS_VALID(txn, p) ((p)->mp_txnid <= (txn)->mt_front)
+#define IS_MODIFIABLE(txn, p) ((p)->mp_txnid == (txn)->mt_front)
+    uint64_t mp_txnid;
+    struct MDBX_page *mp_next; /* for in-memory list of freed pages */
+  };
+  uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */
+#define P_BRANCH 0x01      /* branch page */
+#define P_LEAF 0x02        /* leaf page */
+#define P_OVERFLOW 0x04    /* overflow page */
+#define P_META 0x08        /* meta page */
+#define P_BAD 0x10         /* explicit flag for invalid/bad page */
+#define P_LEAF2 0x20       /* for MDBX_DUPFIXED records */
+#define P_SUBP 0x40        /* for MDBX_DUPSORT sub-pages */
+#define P_SPILLED 0x2000   /* spilled in parent txn */
+#define P_LOOSE 0x4000     /* page was dirtied then freed, can be reused */
+#define P_FROZEN 0x8000    /* used for retire page with known status */
+#define P_ILL_BITS (~(P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW | P_SPILLED))
+  uint16_t mp_flags;
+  union {
+    uint32_t mp_pages; /* number of overflow pages */
+    __anonymous_struct_extension__ struct {
+      indx_t mp_lower; /* lower bound of free space */
+      indx_t mp_upper; /* upper bound of free space */
+    };
+  };
+  pgno_t mp_pgno; /* page number */
+
+#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \
+    (!defined(__cplusplus) && defined(_MSC_VER))
+  indx_t mp_ptrs[] /* dynamic size */;
+#endif /* C99 */
+} MDBX_page;
+
+/* Size of the page header, excluding dynamic data at the end */
+#define PAGEHDRSZ ((unsigned)offsetof(MDBX_page, mp_ptrs))
+
+#pragma pack(pop)
+
+#if MDBX_ENABLE_PGOP_STAT
+/* Statistics of page operations across all (running, completed and aborted)
+ * transactions */
+typedef struct {
+  MDBX_atomic_uint64_t newly;   /* Quantity of new pages added */
+  MDBX_atomic_uint64_t cow;     /* Quantity of pages copied for update */
+  MDBX_atomic_uint64_t clone;   /* Quantity of clones of parent's dirty pages
+                                   made for nested transactions */
+  MDBX_atomic_uint64_t split;   /* Page splits */
+  MDBX_atomic_uint64_t merge;   /* Page merges */
+  MDBX_atomic_uint64_t spill;   /* Quantity of spilled dirty pages */
+  MDBX_atomic_uint64_t unspill; /* Quantity of unspilled/reloaded pages */
+  MDBX_atomic_uint64_t
+      wops; /* Number of explicit write operations (not pages) to disk */
+} MDBX_pgop_stat_t;
+#endif /* MDBX_ENABLE_PGOP_STAT */
+
+#if MDBX_LOCKING == MDBX_LOCKING_WIN32FILES
+#define MDBX_CLOCK_SIGN UINT32_C(0xF10C)
+typedef void mdbx_ipclock_t;
+#elif MDBX_LOCKING == MDBX_LOCKING_SYSV
+
+#define MDBX_CLOCK_SIGN UINT32_C(0xF18D)
+typedef mdbx_pid_t mdbx_ipclock_t;
+#ifndef EOWNERDEAD
+#define EOWNERDEAD MDBX_RESULT_TRUE
+#endif
+
+#elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \
+    MDBX_LOCKING == MDBX_LOCKING_POSIX2008
+#define MDBX_CLOCK_SIGN UINT32_C(0x8017)
+typedef pthread_mutex_t mdbx_ipclock_t;
+#elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988
+#define MDBX_CLOCK_SIGN UINT32_C(0xFC29)
+typedef sem_t mdbx_ipclock_t;
+#else
+#error "FIXME"
+#endif /* MDBX_LOCKING */
+
+#if MDBX_LOCKING > MDBX_LOCKING_SYSV && !defined(__cplusplus)
+MDBX_INTERNAL_FUNC int mdbx_ipclock_stub(mdbx_ipclock_t *ipc);
+MDBX_INTERNAL_FUNC int mdbx_ipclock_destroy(mdbx_ipclock_t *ipc);
+#endif /* MDBX_LOCKING */
+
+/* Reader Lock Table
+ *
+ * Readers don't acquire any locks for their data access. Instead, they
+ * simply record their transaction ID in the reader table.
The reader + * mutex is needed just to find an empty slot in the reader table. The + * slot's address is saved in thread-specific data so that subsequent + * read transactions started by the same thread need no further locking to + * proceed. + * + * If MDBX_NOTLS is set, the slot address is not saved in thread-specific data. + * No reader table is used if the database is on a read-only filesystem. + * + * Since the database uses multi-version concurrency control, readers don't + * actually need any locking. This table is used to keep track of which + * readers are using data from which old transactions, so that we'll know + * when a particular old transaction is no longer in use. Old transactions + * that have discarded any data pages can then have those pages reclaimed + * for use by a later write transaction. + * + * The lock table is constructed such that reader slots are aligned with the + * processor's cache line size. Any slot is only ever used by one thread. + * This alignment guarantees that there will be no contention or cache + * thrashing as threads update their own slot info, and also eliminates + * any need for locking when accessing a slot. + * + * A writer thread will scan every slot in the table to determine the oldest + * outstanding reader transaction. Any freed pages older than this will be + * reclaimed by the writer. The writer doesn't use any locks when scanning + * this table. This means that there's no guarantee that the writer will + * see the most up-to-date reader info, but that's not required for correct + * operation - all we need is to know the upper bound on the oldest reader, + * we don't care at all about the newest reader. So the only consequence of + * reading stale information here is that old pages might hang around a + * while longer before being reclaimed. That's actually good anyway, because + * the longer we delay reclaiming old pages, the more likely it is that a + * string of contiguous pages can be found after coalescing old pages from + * many old transactions together. */ + +/* The actual reader record, with cacheline padding. */ +typedef struct MDBX_reader { + /* Current Transaction ID when this transaction began, or (txnid_t)-1. + * Multiple readers that start at the same time will probably have the + * same ID here. Again, it's not important to exclude them from + * anything; all we need to know is which version of the DB they + * started from so we can avoid overwriting any data used in that + * particular version. */ + MDBX_atomic_uint64_t /* txnid_t */ mr_txnid; + + /* The information we store in a single slot of the reader table. + * In addition to a transaction ID, we also record the process and + * thread ID that owns a slot, so that we can detect stale information, + * e.g. threads or processes that went away without cleaning up. + * + * NOTE: We currently don't check for stale records. + * We simply re-init the table when we know that we're the only process + * opening the lock file. */ + + /* The thread ID of the thread owning this txn. */ + MDBX_atomic_uint64_t mr_tid; + + /* The process ID of the process owning this reader txn. */ + MDBX_atomic_uint32_t mr_pid; + + /* The number of pages used in the reader's MVCC snapshot, + * i.e. the value of meta->mm_geo.next and txn->mt_next_pgno */ + atomic_pgno_t mr_snapshot_pages_used; + /* Number of retired pages at the time this reader starts transaction. 
So, at any time the difference mm_pages_retired - mr_snapshot_pages_retired
+   * will give the number of pages which this reader is restraining
+   * from reuse. */
+  MDBX_atomic_uint64_t mr_snapshot_pages_retired;
+} MDBX_reader;
+
+/* The header for the reader table (a memory-mapped lock file). */
+typedef struct MDBX_lockinfo {
+  /* Stamp identifying this as an MDBX file.
+   * It must be set to MDBX_MAGIC with MDBX_LOCK_VERSION. */
+  uint64_t mti_magic_and_version;
+
+  /* Format of this lock file. Must be set to MDBX_LOCK_FORMAT. */
+  uint32_t mti_os_and_format;
+
+  /* Flags with which the environment was opened. */
+  MDBX_atomic_uint32_t mti_envmode;
+
+  /* Threshold of un-synced-with-disk pages for the auto-sync feature;
+   * zero means no threshold, i.e. auto-sync is disabled. */
+  atomic_pgno_t mti_autosync_threshold;
+
+  /* Low 32 bits of the txnid with which the meta-pages were synced,
+   * i.e. for sync-polling in the MDBX_NOMETASYNC mode. */
+  MDBX_atomic_uint32_t mti_meta_sync_txnid;
+
+  /* Period for the timed auto-sync feature, i.e. at every steady checkpoint
+   * mti_unsynced_timeout is set to current_time + mti_autosync_period.
+   * The time value is represented in a suitable system-dependent form, for
+   * example clock_gettime(CLOCK_BOOTTIME) or clock_gettime(CLOCK_MONOTONIC).
+   * Zero means timed auto-sync is disabled. */
+  MDBX_atomic_uint64_t mti_autosync_period;
+
+  /* Marker to distinguish uniqueness of DB/CLK. */
+  MDBX_atomic_uint64_t mti_bait_uniqueness;
+
+  alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/
+
+#if MDBX_ENABLE_PGOP_STAT
+      /* Statistics of costly ops of all (running, completed and aborted)
+       * transactions */
+      MDBX_pgop_stat_t mti_pgop_stat;
+#endif /* MDBX_ENABLE_PGOP_STAT*/
+
+  alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/
+
+      /* Write transaction lock. */
+#if MDBX_LOCKING > 0
+      mdbx_ipclock_t mti_wlock;
+#endif /* MDBX_LOCKING > 0 */
+
+  atomic_txnid_t mti_oldest_reader;
+
+  /* Timestamp of the last steady sync. The value is represented in a suitable
+   * system-dependent form, for example clock_gettime(CLOCK_BOOTTIME) or
+   * clock_gettime(CLOCK_MONOTONIC). */
+  MDBX_atomic_uint64_t mti_sync_timestamp;
+
+  /* Number of un-synced-with-disk pages for the auto-sync feature. */
+  atomic_pgno_t mti_unsynced_pages;
+
+  /* Number of pages which were discarded last time by madvise(MADV_FREE). */
+  atomic_pgno_t mti_discarded_tail;
+
+  /* Timestamp of the last readers check. */
+  MDBX_atomic_uint64_t mti_reader_check_timestamp;
+
+  /* Shared anchor for tracking readahead edge and enabled/disabled status. */
+  pgno_t mti_readahead_anchor;
+
+  alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/
+
+      /* Readers registration lock. */
+#if MDBX_LOCKING > 0
+      mdbx_ipclock_t mti_rlock;
+#endif /* MDBX_LOCKING > 0 */
+
+  /* The number of slots that have been used in the reader table.
+   * This always records the maximum count; it is not decremented
+   * when readers release their slots.
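+   *
+   * An illustrative (not normative) sketch of the writer-side scan described
+   * above (lck and txn being hypothetical locals, using the atomic_load32/64
+   * helpers defined earlier):
+   *
+   *   txnid_t oldest = txn->mt_txnid - 1;
+   *   const uint32_t n = atomic_load32(&lck->mti_numreaders, mo_AcquireRelease);
+   *   for (uint32_t i = 0; i < n; ++i)
+   *     if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease)) {
+   *       const txnid_t rtxn =
+   *           atomic_load64(&lck->mti_readers[i].mr_txnid, mo_AcquireRelease);
+   *       if (rtxn < oldest) oldest = rtxn;
+   *     }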
*/
+  MDBX_atomic_uint32_t mti_numreaders;
+  MDBX_atomic_uint32_t mti_readers_refresh_flag;
+
+#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \
+    (!defined(__cplusplus) && defined(_MSC_VER))
+  alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/
+      MDBX_reader mti_readers[] /* dynamic size */;
+#endif /* C99 */
+} MDBX_lockinfo;
+
+/* Lockfile format signature: version, features and field layout */
+#define MDBX_LOCK_FORMAT \
+  (MDBX_CLOCK_SIGN * 27733 + (unsigned)sizeof(MDBX_reader) * 13 + \
+   (unsigned)offsetof(MDBX_reader, mr_snapshot_pages_used) * 251 + \
+   (unsigned)offsetof(MDBX_lockinfo, mti_oldest_reader) * 83 + \
+   (unsigned)offsetof(MDBX_lockinfo, mti_numreaders) * 37 + \
+   (unsigned)offsetof(MDBX_lockinfo, mti_readers) * 29)
+
+#define MDBX_DATA_MAGIC \
+  ((MDBX_MAGIC << 8) + MDBX_PNL_ASCENDING * 64 + MDBX_DATA_VERSION)
+#define MDBX_DATA_MAGIC_DEVEL ((MDBX_MAGIC << 8) + 255)
+
+#define MDBX_LOCK_MAGIC ((MDBX_MAGIC << 8) + MDBX_LOCK_VERSION)
+
+/* The maximum size of a database page.
+ *
+ * It is 64K, but (value - PAGEHDRSZ) must fit in MDBX_page.mp_upper.
+ *
+ * MDBX will use database pages < OS pages if needed.
+ * That causes more I/O in write transactions: The OS must
+ * know (read) the whole page before writing a partial page.
+ *
+ * Note that we don't currently support Huge pages. On Linux,
+ * regular data files cannot use Huge pages, and in general
+ * Huge pages aren't actually pageable. We rely on the OS
+ * demand-pager to read our data and page it out when memory
+ * pressure from other processes is high. So until OSs have
+ * actual paging support for Huge pages, they're not viable. */
+#define MAX_PAGESIZE MDBX_MAX_PAGESIZE
+#define MIN_PAGESIZE MDBX_MIN_PAGESIZE
+
+#define MIN_MAPSIZE (MIN_PAGESIZE * MIN_PAGENO)
+#if defined(_WIN32) || defined(_WIN64)
+#define MAX_MAPSIZE32 UINT32_C(0x38000000)
+#else
+#define MAX_MAPSIZE32 UINT32_C(0x7f000000)
+#endif
+#define MAX_MAPSIZE64 (MAX_PAGENO * (uint64_t)MAX_PAGESIZE)
+
+#if MDBX_WORDBITS >= 64
+#define MAX_MAPSIZE MAX_MAPSIZE64
+#define MDBX_READERS_LIMIT \
+  ((MAX_PAGESIZE - sizeof(MDBX_lockinfo)) / sizeof(MDBX_reader))
+#define MDBX_PGL_LIMIT ((size_t)MAX_PAGENO)
+#else
+#define MDBX_READERS_LIMIT 1024
+#define MAX_MAPSIZE MAX_MAPSIZE32
+#define MDBX_PGL_LIMIT (MAX_MAPSIZE32 / MIN_PAGESIZE)
+#endif /* MDBX_WORDBITS */
+
+#define MDBX_RADIXSORT_THRESHOLD 333
+
+/*----------------------------------------------------------------------------*/
+
+/* A PNL is a Page Number List, a sorted array of IDs.
+ * The first element of the array is a counter of how many actual page-numbers
+ * are in the list. By default PNLs are sorted in descending order, which
+ * allows cutting off the page with the lowest pgno (at the tail) by just
+ * truncating the list. The sort order of PNLs is controlled by the
+ * MDBX_PNL_ASCENDING build option. (An illustrative layout sketch follows
+ * a few lines below.) */
+typedef pgno_t *MDBX_PNL;
+
+#if MDBX_PNL_ASCENDING
+#define MDBX_PNL_ORDERED(first, last) ((first) < (last))
+#define MDBX_PNL_DISORDERED(first, last) ((first) >= (last))
+#else
+#define MDBX_PNL_ORDERED(first, last) ((first) > (last))
+#define MDBX_PNL_DISORDERED(first, last) ((first) <= (last))
+#endif
+
+/* List of txnid, only for MDBX_txn.tw.lifo_reclaimed */
+typedef txnid_t *MDBX_TXL;
+
+/* A Dirty-Page list item is a pgno/pointer pair.
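+ *
+ * An illustrative (not normative) sketch tying this to the PNL layout
+ * described above: a default (descending) PNL holding pages {42, 17, 5}
+ * could occupy
+ *
+ *   pgno_t raw[5] = { 4, 3, 42, 17, 5 };  // alloclen slot, count, then pgnos
+ *   MDBX_PNL pnl = raw + 1;               // so pnl[-1] is the alloclen slot
+ *
+ * whereby MDBX_PNL_SIZE(pnl) == 3, MDBX_PNL_FIRST(pnl) == 42, and
+ * MDBX_PNL_LAST(pnl) == MDBX_PNL_LEAST(pnl) == 5 at the truncatable tail
+ * (see the MDBX_PNL_* macros defined below).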
*/
+typedef struct MDBX_dp {
+  MDBX_page *ptr;
+  pgno_t pgno;
+  union {
+    unsigned extra;
+    __anonymous_struct_extension__ struct {
+      unsigned multi : 1;
+      unsigned lru : 31;
+    };
+  };
+} MDBX_dp;
+
+/* A DPL (dirty-page list) is a sorted array of MDBX_dp's. */
+typedef struct MDBX_dpl {
+  unsigned sorted;
+  unsigned length;
+  unsigned detent; /* allocated size excluding the MDBX_DPL_RESERVE_GAP */
+#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \
+    (!defined(__cplusplus) && defined(_MSC_VER))
+  MDBX_dp items[] /* dynamic size with holes at zero and after the last */;
+#endif
+} MDBX_dpl;
+
+/* PNL sizes */
+#define MDBX_PNL_GRANULATE 1024
+#define MDBX_PNL_INITIAL \
+  (MDBX_PNL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t))
+
+#define MDBX_TXL_GRANULATE 32
+#define MDBX_TXL_INITIAL \
+  (MDBX_TXL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t))
+#define MDBX_TXL_MAX \
+  ((1u << 17) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t))
+
+#define MDBX_PNL_ALLOCLEN(pl) ((pl)[-1])
+#define MDBX_PNL_SIZE(pl) ((pl)[0])
+#define MDBX_PNL_FIRST(pl) ((pl)[1])
+#define MDBX_PNL_LAST(pl) ((pl)[MDBX_PNL_SIZE(pl)])
+#define MDBX_PNL_BEGIN(pl) (&(pl)[1])
+#define MDBX_PNL_END(pl) (&(pl)[MDBX_PNL_SIZE(pl) + 1])
+
+#if MDBX_PNL_ASCENDING
+#define MDBX_PNL_LEAST(pl) MDBX_PNL_FIRST(pl)
+#define MDBX_PNL_MOST(pl) MDBX_PNL_LAST(pl)
+#else
+#define MDBX_PNL_LEAST(pl) MDBX_PNL_LAST(pl)
+#define MDBX_PNL_MOST(pl) MDBX_PNL_FIRST(pl)
+#endif
+
+#define MDBX_PNL_SIZEOF(pl) ((MDBX_PNL_SIZE(pl) + 1) * sizeof(pgno_t))
+#define MDBX_PNL_IS_EMPTY(pl) (MDBX_PNL_SIZE(pl) == 0)
+
+/*----------------------------------------------------------------------------*/
+/* Internal structures */
+
+/* Auxiliary DB info.
+ * The information here is mostly static/read-only. There is
+ * only a single copy of this record in the environment. */
+typedef struct MDBX_dbx {
+  MDBX_val md_name;                /* name of the database */
+  MDBX_cmp_func *md_cmp;           /* function for comparing keys */
+  MDBX_cmp_func *md_dcmp;          /* function for comparing data items */
+  size_t md_klen_min, md_klen_max; /* min/max key length for the database */
+  size_t md_vlen_min,
+      md_vlen_max; /* min/max value/data length for the database */
+} MDBX_dbx;
+
+/* A database transaction.
+ * Every operation requires a transaction handle.
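+ *
+ * An illustrative (not normative) sketch of the public-API lifecycle that
+ * ends up in this structure, with a hypothetical already-opened env:
+ *
+ *   MDBX_txn *txn;
+ *   int rc = mdbx_txn_begin(env, NULL, MDBX_TXN_RDONLY, &txn);
+ *   if (rc == MDBX_SUCCESS) {
+ *     // ... mdbx_get() / mdbx_cursor_open() etc. ...
+ *     mdbx_txn_abort(txn);  // or mdbx_txn_commit(txn) for a write txn
+ *   }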
*/ +struct MDBX_txn { +#define MDBX_MT_SIGNATURE UINT32_C(0x93D53A31) + uint32_t mt_signature; + + /* Transaction Flags */ + /* mdbx_txn_begin() flags */ +#define MDBX_TXN_RO_BEGIN_FLAGS (MDBX_TXN_RDONLY | MDBX_TXN_RDONLY_PREPARE) +#define MDBX_TXN_RW_BEGIN_FLAGS \ + (MDBX_TXN_NOMETASYNC | MDBX_TXN_NOSYNC | MDBX_TXN_TRY) + /* Additional flag for mdbx_sync_locked() */ +#define MDBX_SHRINK_ALLOWED UINT32_C(0x40000000) + + /* internal txn flags */ +#define MDBX_TXN_FINISHED 0x01 /* txn is finished or never began */ +#define MDBX_TXN_ERROR 0x02 /* txn is unusable after an error */ +#define MDBX_TXN_DIRTY 0x04 /* must write, even if dirty list is empty */ +#define MDBX_TXN_SPILLS 0x08 /* txn or a parent has spilled pages */ +#define MDBX_TXN_HAS_CHILD 0x10 /* txn has an MDBX_txn.mt_child */ + /* most operations on the txn are currently illegal */ +#define MDBX_TXN_BLOCKED \ + (MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_HAS_CHILD) + +#define TXN_FLAGS \ + (MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_DIRTY | MDBX_TXN_SPILLS | \ + MDBX_TXN_HAS_CHILD) + +#if (TXN_FLAGS & (MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS)) || \ + ((MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS | TXN_FLAGS) & \ + MDBX_SHRINK_ALLOWED) +#error "Oops, some flags overlapped or wrong" +#endif + uint32_t mt_flags; + + MDBX_txn *mt_parent; /* parent of a nested txn */ + /* Nested txn under this txn, set together with flag MDBX_TXN_HAS_CHILD */ + MDBX_txn *mt_child; + MDBX_geo mt_geo; + /* next unallocated page */ +#define mt_next_pgno mt_geo.next + /* corresponding to the current size of datafile */ +#define mt_end_pgno mt_geo.now + + /* The ID of this transaction. IDs are integers incrementing from 1. + * Only committed write transactions increment the ID. If a transaction + * aborts, the ID may be re-used by the next writer. */ + txnid_t mt_txnid; + txnid_t mt_front; + + MDBX_env *mt_env; /* the DB environment */ + /* Array of records for each DB known in the environment. */ + MDBX_dbx *mt_dbxs; + /* Array of MDBX_db records for each known DB */ + MDBX_db *mt_dbs; + /* Array of sequence numbers for each DB handle */ + unsigned *mt_dbiseqs; + + /* Transaction DBI Flags */ +#define DBI_DIRTY MDBX_DBI_DIRTY /* DB was written in this txn */ +#define DBI_STALE MDBX_DBI_STALE /* Named-DB record is older than txnID */ +#define DBI_FRESH MDBX_DBI_FRESH /* Named-DB handle opened in this txn */ +#define DBI_CREAT MDBX_DBI_CREAT /* Named-DB handle created in this txn */ +#define DBI_VALID 0x10 /* DB handle is valid, see also DB_VALID */ +#define DBI_USRVALID 0x20 /* As DB_VALID, but not set for FREE_DBI */ +#define DBI_AUDITED 0x40 /* Internal flag for accounting during audit */ + /* Array of flags for each DB */ + uint8_t *mt_dbistate; + /* Number of DB records in use, or 0 when the txn is finished. + * This number only ever increments until the txn finishes; we + * don't decrement it when individual DB handles are closed. */ + MDBX_dbi mt_numdbs; + size_t mt_owner; /* thread ID that owns this transaction */ + MDBX_canary mt_canary; + void *mt_userctx; /* User-settable context */ + + union { + struct { + /* For read txns: This thread/txn's reader table slot, or NULL. 
*/
+      MDBX_reader *reader;
+    } to;
+    struct {
+      /* In write txns, array of cursors for each DB */
+      MDBX_cursor **cursors;
+      pgno_t *reclaimed_pglist; /* Reclaimed GC pages */
+      txnid_t last_reclaimed;   /* ID of last used record */
+#if MDBX_ENABLE_REFUND
+      pgno_t loose_refund_wl /* FIXME: describe */;
+#endif /* MDBX_ENABLE_REFUND */
+      /* dirtylist room: Dirty array size - dirty pages visible to this txn.
+       * Includes ancestor txns' dirty pages not hidden by other txns'
+       * dirty/spilled pages. Thus commit(nested txn) has room to merge
+       * dirtylist into mt_parent after freeing hidden mt_parent pages. */
+      unsigned dirtyroom;
+      /* a sequence for spilling dirty pages with the LRU policy */
+      unsigned dirtylru;
+      /* For write txns: Modified pages. Sorted when not MDBX_WRITEMAP. */
+      MDBX_dpl *dirtylist;
+      /* The list of reclaimed txns from GC */
+      MDBX_TXL lifo_reclaimed;
+      /* The list of pages that became unused during this transaction. */
+      MDBX_PNL retired_pages;
+      /* The list of loose pages that became unused and may be reused
+       * in this transaction, linked through `mp_next`. */
+      MDBX_page *loose_pages;
+      /* Number of loose pages (tw.loose_pages) */
+      unsigned loose_count;
+      /* The sorted list of dirty pages we temporarily wrote to disk
+       * because the dirty list was full. Page numbers here are
+       * shifted left by 1; deleted slots have the LSB set. */
+      MDBX_PNL spill_pages;
+      unsigned spill_least_removed;
+    } tw;
+  };
+};
+
+#if MDBX_WORDBITS >= 64
+#define CURSOR_STACK 32
+#else
+#define CURSOR_STACK 24
+#endif
+
+struct MDBX_xcursor;
+
+/* Cursors are used for all DB operations.
+ * A cursor holds a path of (page pointer, key index) from the DB
+ * root to a position in the DB, plus other state. MDBX_DUPSORT
+ * cursors include an xcursor to the current data item. Write txns
+ * track their cursors and keep them up to date when data moves.
+ * Exception: An xcursor's pointer to a P_SUBP page can be stale.
+ * (A node with F_DUPDATA but no F_SUBDATA contains a subpage). */
+struct MDBX_cursor {
+#define MDBX_MC_LIVE UINT32_C(0xFE05D5B1)
+#define MDBX_MC_READY4CLOSE UINT32_C(0x2817A047)
+#define MDBX_MC_WAIT4EOT UINT32_C(0x90E297A7)
+  uint32_t mc_signature;
+  /* The database handle this cursor operates on */
+  MDBX_dbi mc_dbi;
+  /* Next cursor on this DB in this txn */
+  MDBX_cursor *mc_next;
+  /* Backup of the original cursor if this cursor is a shadow */
+  MDBX_cursor *mc_backup;
+  /* Context used for databases with MDBX_DUPSORT, otherwise NULL */
+  struct MDBX_xcursor *mc_xcursor;
+  /* The transaction that owns this cursor */
+  MDBX_txn *mc_txn;
+  /* The database record for this cursor */
+  MDBX_db *mc_db;
+  /* The database auxiliary record for this cursor */
+  MDBX_dbx *mc_dbx;
+  /* The mt_dbistate for this database */
+  uint8_t *mc_dbistate;
+  unsigned mc_snum; /* number of pushed pages */
+  unsigned mc_top;  /* index of top page, normally mc_snum-1 */
+
+  /* Cursor state flags. */
+#define C_INITIALIZED 0x01 /* cursor has been initialized and is valid */
+#define C_EOF 0x02         /* No more data */
+#define C_SUB 0x04         /* Cursor is a sub-cursor */
+#define C_DEL 0x08         /* last op was a cursor_del */
+#define C_UNTRACK 0x10     /* Un-track cursor when closing */
+#define C_RECLAIMING 0x20  /* GC lookup is prohibited */
+#define C_GCFREEZE 0x40    /* reclaimed_pglist must not be updated */
+
+  /* Cursor checking flags.
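+ * These are transient: per the descriptions below they are set around
+ * internal copy/update/rebalance steps to relax checks that would
+ * otherwise misfire while the tree is in a half-updated state.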
*/ +#define C_COPYING 0x100 /* skip key-value length check (copying simplify) */ +#define C_UPDATING 0x200 /* update/rebalance pending */ +#define C_RETIRING 0x400 /* refs to child pages may be invalid */ +#define C_SKIPORD 0x800 /* don't check keys ordering */ + + unsigned mc_flags; /* see mdbx_cursor */ + MDBX_page *mc_pg[CURSOR_STACK]; /* stack of pushed pages */ + indx_t mc_ki[CURSOR_STACK]; /* stack of page indices */ +}; + +/* Context for sorted-dup records. + * We could have gone to a fully recursive design, with arbitrarily + * deep nesting of sub-databases. But for now we only handle these + * levels - main DB, optional sub-DB, sorted-duplicate DB. */ +typedef struct MDBX_xcursor { + /* A sub-cursor for traversing the Dup DB */ + MDBX_cursor mx_cursor; + /* The database record for this Dup DB */ + MDBX_db mx_db; + /* The auxiliary DB record for this Dup DB */ + MDBX_dbx mx_dbx; +} MDBX_xcursor; + +typedef struct MDBX_cursor_couple { + MDBX_cursor outer; + void *mc_userctx; /* User-settable context */ + MDBX_xcursor inner; +} MDBX_cursor_couple; + +/* The database environment. */ +struct MDBX_env { + /* ----------------------------------------------------- mostly static part */ +#define MDBX_ME_SIGNATURE UINT32_C(0x9A899641) + MDBX_atomic_uint32_t me_signature; + /* Failed to update the meta page. Probably an I/O error. */ +#define MDBX_FATAL_ERROR UINT32_C(0x80000000) + /* Some fields are initialized. */ +#define MDBX_ENV_ACTIVE UINT32_C(0x20000000) + /* me_txkey is set */ +#define MDBX_ENV_TXKEY UINT32_C(0x10000000) + /* Legacy MDBX_MAPASYNC (prior v0.9) */ +#define MDBX_DEPRECATED_MAPASYNC UINT32_C(0x100000) +#define ENV_INTERNAL_FLAGS (MDBX_FATAL_ERROR | MDBX_ENV_ACTIVE | MDBX_ENV_TXKEY) + uint32_t me_flags; + mdbx_mmap_t me_dxb_mmap; /* The main data file */ +#define me_map me_dxb_mmap.dxb +#define me_lazy_fd me_dxb_mmap.fd + mdbx_filehandle_t me_dsync_fd; + mdbx_mmap_t me_lck_mmap; /* The lock file */ +#define me_lfd me_lck_mmap.fd + struct MDBX_lockinfo *me_lck; + + unsigned me_psize; /* DB page size, initialized from me_os_psize */ + unsigned me_leaf_nodemax; /* max size of a leaf-node */ + uint8_t me_psize2log; /* log2 of DB page size */ + int8_t me_stuck_meta; /* recovery-only: target meta page or less that zero */ + uint16_t me_merge_threshold, + me_merge_threshold_gc; /* pages emptier than this are candidates for + merging */ + unsigned me_os_psize; /* OS page size, from mdbx_syspagesize() */ + unsigned me_maxreaders; /* size of the reader table */ + MDBX_dbi me_maxdbs; /* size of the DB table */ + uint32_t me_pid; /* process ID of this env */ + mdbx_thread_key_t me_txkey; /* thread-key for readers */ + char *me_pathname; /* path to the DB files */ + void *me_pbuf; /* scratch area for DUPSORT put() */ + MDBX_txn *me_txn0; /* prealloc'd write transaction */ + + MDBX_dbx *me_dbxs; /* array of static DB info */ + uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ + unsigned *me_dbiseqs; /* array of dbi sequence numbers */ + unsigned + me_maxgc_ov1page; /* Number of pgno_t fit in a single overflow page */ + uint32_t me_live_reader; /* have liveness lock in reader table */ + void *me_userctx; /* User-settable context */ + MDBX_hsr_func *me_hsr_callback; /* Callback for kicking laggard readers */ + + struct { + unsigned dp_reserve_limit; + unsigned rp_augment_limit; + unsigned dp_limit; + unsigned dp_initial; + uint8_t dp_loose_limit; + uint8_t spill_max_denominator; + uint8_t spill_min_denominator; + uint8_t spill_parent4child_denominator; + unsigned 
merge_threshold_16dot16_percent; + union { + unsigned all; + /* tracks options with non-auto values but tuned by user */ + struct { + unsigned dp_limit : 1; + } non_auto; + } flags; + } me_options; + + /* struct me_dbgeo used for accepting db-geo params from user for the new + * database creation, i.e. when mdbx_env_set_geometry() was called before + * mdbx_env_open(). */ + struct { + size_t lower; /* minimal size of datafile */ + size_t upper; /* maximal size of datafile */ + size_t now; /* current size of datafile */ + size_t grow; /* step to grow datafile */ + size_t shrink; /* threshold to shrink datafile */ + } me_dbgeo; + +#if MDBX_LOCKING == MDBX_LOCKING_SYSV + union { + key_t key; + int semid; + } me_sysv_ipc; +#endif /* MDBX_LOCKING == MDBX_LOCKING_SYSV */ + + MDBX_env *me_lcklist_next; + + /* --------------------------------------------------- mostly volatile part */ + + MDBX_txn *me_txn; /* current write transaction */ + mdbx_fastmutex_t me_dbi_lock; + MDBX_dbi me_numdbs; /* number of DBs opened */ + + MDBX_page *me_dp_reserve; /* list of malloc'ed blocks for re-use */ + unsigned me_dp_reserve_len; + /* PNL of pages that became unused in a write txn */ + MDBX_PNL me_retired_pages; + +#if defined(_WIN32) || defined(_WIN64) + MDBX_srwlock me_remap_guard; + /* Workaround for LockFileEx and WriteFile multithread bug */ + CRITICAL_SECTION me_windowsbug_lock; +#else + mdbx_fastmutex_t me_remap_guard; +#endif + + /* -------------------------------------------------------------- debugging */ + +#if MDBX_DEBUG + MDBX_assert_func *me_assert_func; /* Callback for assertion failures */ +#endif +#ifdef MDBX_USE_VALGRIND + int me_valgrind_handle; +#endif +#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) + pgno_t me_poison_edge; +#endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */ + +#ifndef xMDBX_DEBUG_SPILLING +#define xMDBX_DEBUG_SPILLING 0 +#endif +#if xMDBX_DEBUG_SPILLING == 2 + unsigned debug_dirtied_est, debug_dirtied_act; +#endif /* xMDBX_DEBUG_SPILLING */ + + /* ------------------------------------------------- stub for lck-less mode */ + MDBX_atomic_uint64_t + x_lckless_stub[(sizeof(MDBX_lockinfo) + MDBX_CACHELINE_SIZE - 1) / + sizeof(MDBX_atomic_uint64_t)]; +}; + +#ifndef __cplusplus +/*----------------------------------------------------------------------------*/ +/* Debug and Logging stuff */ + +#define MDBX_RUNTIME_FLAGS_INIT \ + ((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT + ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT + +extern uint8_t mdbx_runtime_flags; +extern uint8_t mdbx_loglevel; +extern MDBX_debug_func *mdbx_debug_logger; + +MDBX_INTERNAL_FUNC void MDBX_PRINTF_ARGS(4, 5) + mdbx_debug_log(int level, const char *function, int line, const char *fmt, + ...) 
MDBX_PRINTF_ARGS(4, 5); +MDBX_INTERNAL_FUNC void mdbx_debug_log_va(int level, const char *function, + int line, const char *fmt, + va_list args); + +#define mdbx_log_enabled(msg) unlikely(msg <= mdbx_loglevel) + +#if MDBX_DEBUG + +#define mdbx_assert_enabled() unlikely(mdbx_runtime_flags &MDBX_DBG_ASSERT) + +#define mdbx_audit_enabled() unlikely(mdbx_runtime_flags &MDBX_DBG_AUDIT) + +#else /* MDBX_DEBUG */ + +#define mdbx_audit_enabled() (0) + +#if !defined(NDEBUG) || MDBX_FORCE_ASSERTIONS +#define mdbx_assert_enabled() (1) +#else +#define mdbx_assert_enabled() (0) +#endif /* NDEBUG */ + +#endif /* MDBX_DEBUG */ + +#if !MDBX_DEBUG && defined(__ANDROID_API__) +#define mdbx_assert_fail(env, msg, func, line) \ + __android_log_assert(msg, "mdbx", "%s:%u", func, line) +#else +void mdbx_assert_fail(const MDBX_env *env, const char *msg, const char *func, + int line); +#endif + +#define mdbx_debug_extra(fmt, ...) \ + do { \ + if (MDBX_DEBUG && mdbx_log_enabled(MDBX_LOG_EXTRA)) \ + mdbx_debug_log(MDBX_LOG_EXTRA, __func__, __LINE__, fmt, __VA_ARGS__); \ + } while (0) + +#define mdbx_debug_extra_print(fmt, ...) \ + do { \ + if (MDBX_DEBUG && mdbx_log_enabled(MDBX_LOG_EXTRA)) \ + mdbx_debug_log(MDBX_LOG_EXTRA, NULL, 0, fmt, __VA_ARGS__); \ + } while (0) + +#define mdbx_trace(fmt, ...) \ + do { \ + if (MDBX_DEBUG && mdbx_log_enabled(MDBX_LOG_TRACE)) \ + mdbx_debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", \ + __VA_ARGS__); \ + } while (0) + +#define mdbx_debug(fmt, ...) \ + do { \ + if (MDBX_DEBUG && mdbx_log_enabled(MDBX_LOG_DEBUG)) \ + mdbx_debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", \ + __VA_ARGS__); \ + } while (0) + +#define mdbx_verbose(fmt, ...) \ + do { \ + if (MDBX_DEBUG && mdbx_log_enabled(MDBX_LOG_VERBOSE)) \ + mdbx_debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", \ + __VA_ARGS__); \ + } while (0) + +#define mdbx_notice(fmt, ...) \ + do { \ + if (mdbx_log_enabled(MDBX_LOG_NOTICE)) \ + mdbx_debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", \ + __VA_ARGS__); \ + } while (0) + +#define mdbx_warning(fmt, ...) \ + do { \ + if (mdbx_log_enabled(MDBX_LOG_WARN)) \ + mdbx_debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", \ + __VA_ARGS__); \ + } while (0) + +#define mdbx_error(fmt, ...) \ + do { \ + if (mdbx_log_enabled(MDBX_LOG_ERROR)) \ + mdbx_debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", \ + __VA_ARGS__); \ + } while (0) + +#define mdbx_fatal(fmt, ...) 
\ + mdbx_debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__); + +#define mdbx_ensure_msg(env, expr, msg) \ + do { \ + if (unlikely(!(expr))) \ + mdbx_assert_fail(env, msg, __func__, __LINE__); \ + } while (0) + +#define mdbx_ensure(env, expr) mdbx_ensure_msg(env, expr, #expr) + +/* assert(3) variant in environment context */ +#define mdbx_assert(env, expr) \ + do { \ + if (mdbx_assert_enabled()) \ + mdbx_ensure(env, expr); \ + } while (0) + +/* assert(3) variant in cursor context */ +#define mdbx_cassert(mc, expr) mdbx_assert((mc)->mc_txn->mt_env, expr) + +/* assert(3) variant in transaction context */ +#define mdbx_tassert(txn, expr) mdbx_assert((txn)->mt_env, expr) + +#ifndef xMDBX_TOOLS /* Avoid using internal mdbx_assert() */ +#undef assert +#define assert(expr) mdbx_assert(NULL, expr) +#endif + +/*----------------------------------------------------------------------------*/ +/* Cache coherence and mmap invalidation */ + +#if MDBX_CPU_WRITEBACK_INCOHERENT +#define mdbx_flush_incoherent_cpu_writeback() mdbx_memory_barrier() +#else +#define mdbx_flush_incoherent_cpu_writeback() mdbx_compiler_barrier() +#endif /* MDBX_CPU_WRITEBACK_INCOHERENT */ + +MDBX_MAYBE_UNUSED static __inline void +mdbx_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { +#if MDBX_MMAP_INCOHERENT_FILE_WRITE + char *const begin = (char *)(-pagesize & (intptr_t)addr); + char *const end = + (char *)(-pagesize & (intptr_t)((char *)addr + nbytes + pagesize - 1)); + int err = msync(begin, end - begin, MS_SYNC | MS_INVALIDATE) ? errno : 0; + mdbx_assert(nullptr, err == 0); + (void)err; +#else + (void)pagesize; +#endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */ + +#if MDBX_MMAP_INCOHERENT_CPU_CACHE +#ifdef DCACHE + /* MIPS has cache coherency issues. + * Note: for any nbytes >= on-chip cache size, entire is flushed. */ + cacheflush(addr, nbytes, DCACHE); +#else +#error "Oops, cacheflush() not available" +#endif /* DCACHE */ +#endif /* MDBX_MMAP_INCOHERENT_CPU_CACHE */ + +#if !MDBX_MMAP_INCOHERENT_FILE_WRITE && !MDBX_MMAP_INCOHERENT_CPU_CACHE + (void)addr; + (void)nbytes; +#endif +} + +/*----------------------------------------------------------------------------*/ +/* Internal prototypes */ + +MDBX_INTERNAL_FUNC int mdbx_cleanup_dead_readers(MDBX_env *env, int rlocked, + int *dead); +MDBX_INTERNAL_FUNC int mdbx_rthc_alloc(mdbx_thread_key_t *key, + MDBX_reader *begin, MDBX_reader *end); +MDBX_INTERNAL_FUNC void mdbx_rthc_remove(const mdbx_thread_key_t key); + +MDBX_INTERNAL_FUNC void mdbx_rthc_global_init(void); +MDBX_INTERNAL_FUNC void mdbx_rthc_global_dtor(void); +MDBX_INTERNAL_FUNC void mdbx_rthc_thread_dtor(void *ptr); + +MDBX_MAYBE_UNUSED static __inline void mdbx_jitter4testing(bool tiny) { +#if MDBX_DEBUG + if (MDBX_DBG_JITTER & mdbx_runtime_flags) + mdbx_osal_jitter(tiny); +#else + (void)tiny; +#endif +} + +#endif /* !__cplusplus */ + +#define MDBX_IS_ERROR(rc) \ + ((rc) != MDBX_RESULT_TRUE && (rc) != MDBX_RESULT_FALSE) + +/* Internal error codes, not exposed outside libmdbx */ +#define MDBX_NO_ROOT (MDBX_LAST_ADDED_ERRCODE + 10) + +/* Debugging output value of a cursor DBI: Negative in a sub-cursor. */ +#define DDBI(mc) \ + (((mc)->mc_flags & C_SUB) ? -(int)(mc)->mc_dbi : (int)(mc)->mc_dbi) + +/* Key size which fits in a DKBUF (debug key buffer). 
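+ * Illustrative use (a sketch of the pattern seen later in this file, not
+ * part of the original comment): a function declares the buffer once and
+ * formats keys into it for logging:
+ *   DKBUF;
+ *   mdbx_debug("found key %s", DKEY(&key));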
*/ +#define DKBUF_MAX 511 +#define DKBUF char _kbuf[DKBUF_MAX * 4 + 2] +#define DKEY(x) mdbx_dump_val(x, _kbuf, DKBUF_MAX * 2 + 1) +#define DVAL(x) mdbx_dump_val(x, _kbuf + DKBUF_MAX * 2 + 1, DKBUF_MAX * 2 + 1) + +#if MDBX_DEBUG +#define DKBUF_DEBUG DKBUF +#define DKEY_DEBUG(x) DKEY(x) +#define DVAL_DEBUG(x) DVAL(x) +#else +#define DKBUF_DEBUG ((void)(0)) +#define DKEY_DEBUG(x) ("-") +#define DVAL_DEBUG(x) ("-") +#endif + +/* An invalid page number. + * Mainly used to denote an empty tree. */ +#define P_INVALID (~(pgno_t)0) + +/* Test if the flags f are set in a flag word w. */ +#define F_ISSET(w, f) (((w) & (f)) == (f)) + +/* Round n up to an even number. */ +#define EVEN(n) (((n) + 1U) & -2) /* sign-extending -2 to match n+1U */ + +/* Default size of memory map. + * This is certainly too small for any actual applications. Apps should + * always set the size explicitly using mdbx_env_set_geometry(). */ +#define DEFAULT_MAPSIZE MEGABYTE + +/* Number of slots in the reader table. + * This value was chosen somewhat arbitrarily. The 61 is a prime number, + * and such readers plus a couple mutexes fit into single 4KB page. + * Applications should set the table size using mdbx_env_set_maxreaders(). */ +#define DEFAULT_READERS 61 + +/* Test if a page is a leaf page */ +#define IS_LEAF(p) (((p)->mp_flags & P_LEAF) != 0) +/* Test if a page is a LEAF2 page */ +#define IS_LEAF2(p) unlikely(((p)->mp_flags & P_LEAF2) != 0) +/* Test if a page is a branch page */ +#define IS_BRANCH(p) (((p)->mp_flags & P_BRANCH) != 0) +/* Test if a page is an overflow page */ +#define IS_OVERFLOW(p) unlikely(((p)->mp_flags & P_OVERFLOW) != 0) +/* Test if a page is a sub page */ +#define IS_SUBP(p) (((p)->mp_flags & P_SUBP) != 0) + +#define PAGETYPE(p) ((p)->mp_flags & (P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW)) + +/* Header for a single key/data pair within a page. + * Used in pages of type P_BRANCH and P_LEAF without P_LEAF2. + * We guarantee 2-byte alignment for 'MDBX_node's. + * + * Leaf node flags describe node contents. F_BIGDATA says the node's + * data part is the page number of an overflow page with actual data. + * F_DUPDATA and F_SUBDATA can be combined giving duplicate data in + * a sub-page/sub-database, and named databases (just F_SUBDATA). 
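+ *
+ * In summary, the node's data part is interpreted as follows (a recap of
+ * the flag descriptions above, not text from the original header):
+ *   (none)              - value bytes stored inline in the node;
+ *   F_BIGDATA           - data part holds the pgno of an overflow page;
+ *   F_DUPDATA           - data part is a sub-page of duplicate values;
+ *   F_DUPDATA|F_SUBDATA - data part is the MDBX_db of a dup sub-database;
+ *   F_SUBDATA           - data part is the MDBX_db of a named database.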
*/ +typedef struct MDBX_node { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + union { + uint32_t mn_dsize; + uint32_t mn_pgno32; + }; + uint8_t mn_flags; /* see mdbx_node flags */ + uint8_t mn_extra; + uint16_t mn_ksize; /* key size */ +#else + uint16_t mn_ksize; /* key size */ + uint8_t mn_extra; + uint8_t mn_flags; /* see mdbx_node flags */ + union { + uint32_t mn_pgno32; + uint32_t mn_dsize; + }; +#endif /* __BYTE_ORDER__ */ + + /* mdbx_node Flags */ +#define F_BIGDATA 0x01 /* data put on overflow page */ +#define F_SUBDATA 0x02 /* data is a sub-database */ +#define F_DUPDATA 0x04 /* data has duplicates */ + + /* valid flags for mdbx_node_add() */ +#define NODE_ADD_FLAGS (F_DUPDATA | F_SUBDATA | MDBX_RESERVE | MDBX_APPEND) + +#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \ + (!defined(__cplusplus) && defined(_MSC_VER)) + uint8_t mn_data[] /* key and data are appended here */; +#endif /* C99 */ +} MDBX_node; + +#define DB_PERSISTENT_FLAGS \ + (MDBX_REVERSEKEY | MDBX_DUPSORT | MDBX_INTEGERKEY | MDBX_DUPFIXED | \ + MDBX_INTEGERDUP | MDBX_REVERSEDUP) + +/* mdbx_dbi_open() flags */ +#define DB_USABLE_FLAGS (DB_PERSISTENT_FLAGS | MDBX_CREATE | MDBX_DB_ACCEDE) + +#define DB_VALID 0x8000 /* DB handle is valid, for me_dbflags */ +#define DB_INTERNAL_FLAGS DB_VALID + +#if DB_INTERNAL_FLAGS & DB_USABLE_FLAGS +#error "Oops, some flags overlapped or wrong" +#endif +#if DB_PERSISTENT_FLAGS & ~DB_USABLE_FLAGS +#error "Oops, some flags overlapped or wrong" +#endif + +/* max number of pages to commit in one writev() call */ +#define MDBX_COMMIT_PAGES 64 +#if defined(IOV_MAX) && IOV_MAX < MDBX_COMMIT_PAGES /* sysconf(_SC_IOV_MAX) */ +#undef MDBX_COMMIT_PAGES +#define MDBX_COMMIT_PAGES IOV_MAX +#endif + +/* + * / + * | -1, a < b + * CMP2INT(a,b) = < 0, a == b + * | 1, a > b + * \ + */ +#if 1 +/* LY: fast enough on most systems */ +#define CMP2INT(a, b) (((b) > (a)) ? -1 : (a) > (b)) +#else +#define CMP2INT(a, b) (((a) > (b)) - ((b) > (a))) +#endif + +/* Do not spill pages to disk if txn is getting full, may fail instead */ +#define MDBX_NOSPILL 0x8000 + +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t +pgno_add(pgno_t base, pgno_t augend) { + assert(base <= MAX_PAGENO); + return (augend < MAX_PAGENO - base) ? base + augend : MAX_PAGENO; +} + +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t +pgno_sub(pgno_t base, pgno_t subtrahend) { + assert(base >= MIN_PAGENO); + return (subtrahend < base - MIN_PAGENO) ? 
base - subtrahend : MIN_PAGENO;
+}
+
+MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline bool
+is_powerof2(size_t x) {
+  return (x & (x - 1)) == 0;
+}
+
+MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline size_t
+floor_powerof2(size_t value, size_t granularity) {
+  assert(is_powerof2(granularity));
+  return value & ~(granularity - 1);
+}
+
+MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline size_t
+ceil_powerof2(size_t value, size_t granularity) {
+  return floor_powerof2(value + granularity - 1, granularity);
+}
+
+MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static unsigned
+log2n_powerof2(size_t value) {
+  assert(value > 0 && value < INT32_MAX && is_powerof2(value));
+  assert((value & -(int32_t)value) == value);
+#if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_ctzl)
+  return __builtin_ctzl(value);
+#elif defined(_MSC_VER)
+  unsigned long index;
+  _BitScanForward(&index, (unsigned long)value);
+  return index;
+#else
+  static const uint8_t debruijn_ctz32[32] = {
+      0,  1,  28, 2,  29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4,  8,
+      31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6,  11, 5,  10, 9};
+  return debruijn_ctz32[(uint32_t)(value * 0x077CB531u) >> 27];
+#endif
+}
+
+/* Only a subset of the mdbx_env flags can be changed
+ * at runtime. Changing other flags requires closing the
+ * environment and re-opening it with the new flags. */
+#define ENV_CHANGEABLE_FLAGS \
+  (MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC | MDBX_DEPRECATED_MAPASYNC | \
+   MDBX_NOMEMINIT | MDBX_COALESCE | MDBX_PAGEPERTURB | MDBX_ACCEDE)
+#define ENV_CHANGELESS_FLAGS \
+  (MDBX_NOSUBDIR | MDBX_RDONLY | MDBX_WRITEMAP | MDBX_NOTLS | MDBX_NORDAHEAD | \
+   MDBX_LIFORECLAIM | MDBX_EXCLUSIVE)
+#define ENV_USABLE_FLAGS (ENV_CHANGEABLE_FLAGS | ENV_CHANGELESS_FLAGS)
+
+#if !defined(__cplusplus) || CONSTEXPR_ENUM_FLAGS_OPERATIONS
+MDBX_MAYBE_UNUSED static void static_checks(void) {
+  STATIC_ASSERT_MSG(INT16_MAX - CORE_DBS == MDBX_MAX_DBI,
+                    "Oops, MDBX_MAX_DBI or CORE_DBS?");
+  STATIC_ASSERT_MSG((unsigned)(MDBX_DB_ACCEDE | MDBX_CREATE) ==
+                        ((DB_USABLE_FLAGS | DB_INTERNAL_FLAGS) &
+                         (ENV_USABLE_FLAGS | ENV_INTERNAL_FLAGS)),
+                    "Oops, some flags overlapped or wrong");
+  STATIC_ASSERT_MSG((ENV_INTERNAL_FLAGS & ENV_USABLE_FLAGS) == 0,
+                    "Oops, some flags overlapped or wrong");
+}
+#endif /* Disabled for MSVC 19.0 (VisualStudio 2015) */
+
+#ifdef __cplusplus
+}
+#endif
+/*
+ * Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru>.
+ * and other libmdbx authors: please see AUTHORS file.
+ * All rights reserved.
+ *
+ * This code is derived from "LMDB engine" written by
+ * Howard Chu (Symas Corporation), which itself derived from btree.c
+ * written by Martin Hedenfalk.
+ *
+ * ---
+ *
+ * Portions Copyright 2011-2015 Howard Chu, Symas Corp. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted only as authorized by the OpenLDAP
+ * Public License.
+ *
+ * A copy of this license is available in the file LICENSE in the
+ * top-level directory of the distribution or, alternatively, at
+ * <http://www.OpenLDAP.org/license.html>.
+ *
+ * ---
+ *
+ * Portions Copyright (c) 2009, 2010 Martin Hedenfalk <martin@bzero.se>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + + +/*------------------------------------------------------------------------------ + * Internal inline functions */ + +MDBX_NOTHROW_CONST_FUNCTION static unsigned branchless_abs(int value) { + assert(value > INT_MIN); + const unsigned expanded_sign = + (unsigned)(value >> (sizeof(value) * CHAR_BIT - 1)); + return ((unsigned)value + expanded_sign) ^ expanded_sign; +} + +/* Pack/Unpack 16-bit values for Grow step & Shrink threshold */ +MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t me2v(unsigned m, + unsigned e) { + assert(m < 2048 && e < 8); + return (pgno_t)(32768 + ((m + 1) << (e + 8))); +} + +MDBX_NOTHROW_CONST_FUNCTION static __inline uint16_t v2me(size_t v, + unsigned e) { + assert(v > (e ? me2v(2047, e - 1) : 32768)); + assert(v <= me2v(2047, e)); + size_t m = (v - 32768 + ((size_t)1 << (e + 8)) - 1) >> (e + 8); + m -= m > 0; + assert(m < 2048 && e < 8); + // f e d c b a 9 8 7 6 5 4 3 2 1 0 + // 1 e e e m m m m m m m m m m m 1 + const uint16_t pv = (uint16_t)(0x8001 + (e << 12) + (m << 1)); + assert(pv != 65535); + return pv; +} + +/* Convert 16-bit packed (exponential quantized) value to number of pages */ +MDBX_NOTHROW_CONST_FUNCTION static pgno_t pv2pages(uint16_t pv) { + if ((pv & 0x8001) != 0x8001) + return pv; + if (pv == 65535) + return 65536; + // f e d c b a 9 8 7 6 5 4 3 2 1 0 + // 1 e e e m m m m m m m m m m m 1 + return me2v((pv >> 1) & 2047, (pv >> 12) & 7); +} + +/* Convert number of pages to 16-bit packed (exponential quantized) value */ +MDBX_NOTHROW_CONST_FUNCTION static uint16_t pages2pv(size_t pages) { + if (pages < 32769 || (pages < 65536 && (pages & 1) == 0)) + return (uint16_t)pages; + if (pages <= me2v(2047, 0)) + return v2me(pages, 0); + if (pages <= me2v(2047, 1)) + return v2me(pages, 1); + if (pages <= me2v(2047, 2)) + return v2me(pages, 2); + if (pages <= me2v(2047, 3)) + return v2me(pages, 3); + if (pages <= me2v(2047, 4)) + return v2me(pages, 4); + if (pages <= me2v(2047, 5)) + return v2me(pages, 5); + if (pages <= me2v(2047, 6)) + return v2me(pages, 6); + return (pages < me2v(2046, 7)) ? 
v2me(pages, 7) : 65533; +} + +/*------------------------------------------------------------------------------ + * Unaligned access */ + +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline unsigned +field_alignment(unsigned alignment_baseline, size_t field_offset) { + unsigned merge = alignment_baseline | (unsigned)field_offset; + return merge & -(int)merge; +} + +/* read-thunk for UB-sanitizer */ +MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint8_t +peek_u8(const uint8_t *const __restrict ptr) { + return *ptr; +} + +/* write-thunk for UB-sanitizer */ +static __always_inline void poke_u8(uint8_t *const __restrict ptr, + const uint8_t v) { + *ptr = v; +} + +MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint16_t +unaligned_peek_u16(const unsigned expected_alignment, const void *const ptr) { + assert((uintptr_t)ptr % expected_alignment == 0); + if (MDBX_UNALIGNED_OK || (expected_alignment % sizeof(uint16_t)) == 0) + return *(const uint16_t *)ptr; + else { + uint16_t v; + memcpy(&v, ptr, sizeof(v)); + return v; + } +} + +static __always_inline void +unaligned_poke_u16(const unsigned expected_alignment, + void *const __restrict ptr, const uint16_t v) { + assert((uintptr_t)ptr % expected_alignment == 0); + if (MDBX_UNALIGNED_OK || (expected_alignment % sizeof(v)) == 0) + *(uint16_t *)ptr = v; + else + memcpy(ptr, &v, sizeof(v)); +} + +MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint32_t unaligned_peek_u32( + const unsigned expected_alignment, const void *const __restrict ptr) { + assert((uintptr_t)ptr % expected_alignment == 0); + if (MDBX_UNALIGNED_OK || (expected_alignment % sizeof(uint32_t)) == 0) + return *(const uint32_t *)ptr; + else if ((expected_alignment % sizeof(uint16_t)) == 0) { + const uint16_t lo = + ((const uint16_t *)ptr)[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__]; + const uint16_t hi = + ((const uint16_t *)ptr)[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__]; + return lo | (uint32_t)hi << 16; + } else { + uint32_t v; + memcpy(&v, ptr, sizeof(v)); + return v; + } +} + +static __always_inline void +unaligned_poke_u32(const unsigned expected_alignment, + void *const __restrict ptr, const uint32_t v) { + assert((uintptr_t)ptr % expected_alignment == 0); + if (MDBX_UNALIGNED_OK || (expected_alignment % sizeof(v)) == 0) + *(uint32_t *)ptr = v; + else if ((expected_alignment % sizeof(uint16_t)) == 0) { + ((uint16_t *)ptr)[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__] = (uint16_t)v; + ((uint16_t *)ptr)[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__] = + (uint16_t)(v >> 16); + } else + memcpy(ptr, &v, sizeof(v)); +} + +MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint64_t unaligned_peek_u64( + const unsigned expected_alignment, const void *const __restrict ptr) { + assert((uintptr_t)ptr % expected_alignment == 0); + if (MDBX_UNALIGNED_OK || (expected_alignment % sizeof(uint64_t)) == 0) + return *(const uint64_t *)ptr; + else if ((expected_alignment % sizeof(uint32_t)) == 0) { + const uint32_t lo = + ((const uint32_t *)ptr)[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__]; + const uint32_t hi = + ((const uint32_t *)ptr)[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__]; + return lo | (uint64_t)hi << 32; + } else { + uint64_t v; + memcpy(&v, ptr, sizeof(v)); + return v; + } +} + +static __always_inline void +unaligned_poke_u64(const unsigned expected_alignment, + void *const __restrict ptr, const uint64_t v) { + assert((uintptr_t)ptr % expected_alignment == 0); + if (MDBX_UNALIGNED_OK || (expected_alignment % sizeof(v)) == 0) + *(uint64_t *)ptr = v; + else if 
((expected_alignment % sizeof(uint32_t)) == 0) { + ((uint32_t *)ptr)[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__] = (uint32_t)v; + ((uint32_t *)ptr)[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__] = + (uint32_t)(v >> 32); + } else + memcpy(ptr, &v, sizeof(v)); +} + +#define UNALIGNED_PEEK_8(ptr, struct, field) \ + peek_u8((const uint8_t *)(ptr) + offsetof(struct, field)) +#define UNALIGNED_POKE_8(ptr, struct, field, value) \ + poke_u8((uint8_t *)(ptr) + offsetof(struct, field), value) + +#define UNALIGNED_PEEK_16(ptr, struct, field) \ + unaligned_peek_u16(1, (const char *)(ptr) + offsetof(struct, field)) +#define UNALIGNED_POKE_16(ptr, struct, field, value) \ + unaligned_poke_u16(1, (char *)(ptr) + offsetof(struct, field), value) + +#define UNALIGNED_PEEK_32(ptr, struct, field) \ + unaligned_peek_u32(1, (const char *)(ptr) + offsetof(struct, field)) +#define UNALIGNED_POKE_32(ptr, struct, field, value) \ + unaligned_poke_u32(1, (char *)(ptr) + offsetof(struct, field), value) + +#define UNALIGNED_PEEK_64(ptr, struct, field) \ + unaligned_peek_u64(1, (const char *)(ptr) + offsetof(struct, field)) +#define UNALIGNED_POKE_64(ptr, struct, field, value) \ + unaligned_poke_u64(1, (char *)(ptr) + offsetof(struct, field), value) + +/* Get the page number pointed to by a branch node */ +MDBX_NOTHROW_PURE_FUNCTION static __always_inline pgno_t +node_pgno(const MDBX_node *const __restrict node) { + pgno_t pgno = UNALIGNED_PEEK_32(node, MDBX_node, mn_pgno32); + if (sizeof(pgno) > 4) + pgno |= ((uint64_t)UNALIGNED_PEEK_8(node, MDBX_node, mn_extra)) << 32; + return pgno; +} + +/* Set the page number in a branch node */ +static __always_inline void node_set_pgno(MDBX_node *const __restrict node, + pgno_t pgno) { + assert(pgno >= MIN_PAGENO && pgno <= MAX_PAGENO); + + UNALIGNED_POKE_32(node, MDBX_node, mn_pgno32, (uint32_t)pgno); + if (sizeof(pgno) > 4) + UNALIGNED_POKE_8(node, MDBX_node, mn_extra, + (uint8_t)((uint64_t)pgno >> 32)); +} + +/* Get the size of the data in a leaf node */ +MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t +node_ds(const MDBX_node *const __restrict node) { + return UNALIGNED_PEEK_32(node, MDBX_node, mn_dsize); +} + +/* Set the size of the data for a leaf node */ +static __always_inline void node_set_ds(MDBX_node *const __restrict node, + size_t size) { + assert(size < INT_MAX); + UNALIGNED_POKE_32(node, MDBX_node, mn_dsize, (uint32_t)size); +} + +/* The size of a key in a node */ +MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t +node_ks(const MDBX_node *const __restrict node) { + return UNALIGNED_PEEK_16(node, MDBX_node, mn_ksize); +} + +/* Set the size of the key for a leaf node */ +static __always_inline void node_set_ks(MDBX_node *const __restrict node, + size_t size) { + assert(size < INT16_MAX); + UNALIGNED_POKE_16(node, MDBX_node, mn_ksize, (uint16_t)size); +} + +MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint8_t +node_flags(const MDBX_node *const __restrict node) { + return UNALIGNED_PEEK_8(node, MDBX_node, mn_flags); +} + +static __always_inline void node_set_flags(MDBX_node *const __restrict node, + uint8_t flags) { + UNALIGNED_POKE_8(node, MDBX_node, mn_flags, flags); +} + +/* Size of the node header, excluding dynamic data at the end */ +#define NODESIZE offsetof(MDBX_node, mn_data) + +/* Address of the key for the node */ +MDBX_NOTHROW_PURE_FUNCTION static __always_inline void * +node_key(const MDBX_node *const __restrict node) { + return (char *)node + NODESIZE; +} + +/* Address of the data for a node */ +MDBX_NOTHROW_PURE_FUNCTION static 
__always_inline void *
+node_data(const MDBX_node *const __restrict node) {
+  return (char *)node_key(node) + node_ks(node);
+}
+
+/* Size of a node in a leaf page with a given key and data.
+ * This is node header plus key plus data size. */
+MDBX_NOTHROW_CONST_FUNCTION static __always_inline size_t
+node_size_len(const size_t key_len, const size_t value_len) {
+  return NODESIZE + EVEN(key_len + value_len);
+}
+MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t
+node_size(const MDBX_val *key, const MDBX_val *value) {
+  return node_size_len(key ? key->iov_len : 0, value ? value->iov_len : 0);
+}
+
+MDBX_NOTHROW_PURE_FUNCTION static __always_inline pgno_t
+peek_pgno(const void *const __restrict ptr) {
+  if (sizeof(pgno_t) == sizeof(uint32_t))
+    return (pgno_t)unaligned_peek_u32(1, ptr);
+  else if (sizeof(pgno_t) == sizeof(uint64_t))
+    return (pgno_t)unaligned_peek_u64(1, ptr);
+  else {
+    pgno_t pgno;
+    memcpy(&pgno, ptr, sizeof(pgno));
+    return pgno;
+  }
+}
+
+static __always_inline void poke_pgno(void *const __restrict ptr,
+                                      const pgno_t pgno) {
+  if (sizeof(pgno) == sizeof(uint32_t))
+    unaligned_poke_u32(1, ptr, pgno);
+  else if (sizeof(pgno) == sizeof(uint64_t))
+    unaligned_poke_u64(1, ptr, pgno);
+  else
+    memcpy(ptr, &pgno, sizeof(pgno));
+}
+
+MDBX_NOTHROW_PURE_FUNCTION static __always_inline pgno_t
+node_largedata_pgno(const MDBX_node *const __restrict node) {
+  assert(node_flags(node) & F_BIGDATA);
+  return peek_pgno(node_data(node));
+}
+
+/*------------------------------------------------------------------------------
+ * Nodes, Keys & Values length limitation factors:
+ *
+ * BRANCH_NODE_MAX
+ *   A branch page must contain at least two nodes, each holding a key and a
+ *   child page number. Moreover, a page cannot be split while it holds fewer
+ *   than 4 keys, i.e. it must not overflow before the fourth key is added.
+ *   Therefore at least 3 branch nodes must fit into a single branch page.
+ *   Further, the first node of a branch page carries no key, i.e. it only
+ *   requires space for itself. Thus:
+ *     PAGEROOM = pagesize - page_hdr_len;
+ *     BRANCH_NODE_MAX = even_floor(
+ *       (PAGEROOM - sizeof(indx_t) - NODESIZE) / (3 - 1) - sizeof(indx_t));
+ *     KEYLEN_MAX = BRANCH_NODE_MAX - node_hdr_len;
+ *
+ * LEAF_NODE_MAX
+ *   A leaf node must fit into a single leaf page, though its value may be
+ *   placed on a large/overflow page. However, it may be necessary to insert
+ *   a nearly page-sized node between two large nodes that already fill up a
+ *   page. In that case the page must be split in two if some pair of nodes
+ *   fits on one page, or otherwise split into THREE pages with a single node
+ *   on each. Such 1-into-3 splitting is costly and complex, since it requires
+ *   TWO insertions into the parent page, which could in turn split it, and
+ *   so on up to the root. Therefore double-splitting is avoided here and the
+ *   maximum node size is half of the leaf page space:
+ *     LEAF_NODE_MAX = even_floor(PAGEROOM / 2 - sizeof(indx_t));
+ *     DATALEN_NO_OVERFLOW = LEAF_NODE_MAX - KEYLEN_MAX;
+ *
+ * - A SubDatabase node must fit into one leaf page:
+ *     SUBDB_NAME_MAX = LEAF_NODE_MAX - node_hdr_len - sizeof(MDBX_db);
+ *
+ * - Dupsort values are themselves keys of a dupsort sub-DB, so they cannot
+ *   be longer than KEYLEN_MAX. But a dupsort node also must not exceed
+ *   LEAF_NODE_MAX, since a dupsort value cannot be placed on a
+ *   large/overflow page:
+ *     DUPSORT_DATALEN_MAX = min(KEYLEN_MAX,
+ *                               max(DATALEN_NO_OVERFLOW, sizeof(MDBX_db)));
+ */
+
+#define PAGEROOM(pagesize) ((pagesize)-PAGEHDRSZ)
+#define EVEN_FLOOR(n) ((n) & ~(size_t)1)
+#define BRANCH_NODE_MAX(pagesize) \
+  (EVEN_FLOOR((PAGEROOM(pagesize) - sizeof(indx_t) - NODESIZE) / (3 - 1) - \
+              sizeof(indx_t)))
+#define LEAF_NODE_MAX(pagesize) \
+  (EVEN_FLOOR(PAGEROOM(pagesize) / 2) - sizeof(indx_t))
+#define MAX_GC1OVPAGE(pagesize) (PAGEROOM(pagesize) / sizeof(pgno_t) - 1)
+
+static __inline unsigned keysize_max(size_t pagesize, MDBX_db_flags_t flags) {
+  assert(pagesize >= MIN_PAGESIZE && pagesize <= MAX_PAGESIZE &&
+         is_powerof2(pagesize));
+  STATIC_ASSERT(BRANCH_NODE_MAX(MIN_PAGESIZE) - NODESIZE >= 8);
+  if (flags & MDBX_INTEGERKEY)
+    return 8 /* sizeof(uint64_t) */;
+
+  const intptr_t max_branch_key = BRANCH_NODE_MAX(pagesize) - NODESIZE;
+  STATIC_ASSERT(LEAF_NODE_MAX(MIN_PAGESIZE) - NODESIZE -
+                    /* sizeof(uint64) as a key */ 8 >
+                sizeof(MDBX_db));
+  if (flags &
+      (MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP | MDBX_INTEGERDUP)) {
+    const intptr_t max_dupsort_leaf_key =
+        LEAF_NODE_MAX(pagesize) - NODESIZE - sizeof(MDBX_db);
+    return (max_branch_key < max_dupsort_leaf_key)
+               ? (unsigned)max_branch_key
+               : (unsigned)max_dupsort_leaf_key;
+  }
+  return (unsigned)max_branch_key;
+}
+
+static __inline size_t valsize_max(size_t pagesize, MDBX_db_flags_t flags) {
+  assert(pagesize >= MIN_PAGESIZE && pagesize <= MAX_PAGESIZE &&
+         is_powerof2(pagesize));
+
+  if (flags & MDBX_INTEGERDUP)
+    return 8 /* sizeof(uint64_t) */;
+
+  if (flags & (MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP))
+    return keysize_max(pagesize, 0);
+
+  const unsigned page_ln2 = log2n_powerof2(pagesize);
+  const size_t hard = 0x7FF00000ul;
+  const size_t hard_pages = hard >> page_ln2;
+  STATIC_ASSERT(MDBX_PGL_LIMIT <= MAX_PAGENO);
+  const size_t pages_limit = MDBX_PGL_LIMIT / 4;
+  const size_t limit =
+      (hard_pages < pages_limit) ? hard : (pages_limit << page_ln2);
+  return (limit < MAX_MAPSIZE / 2) ? limit : MAX_MAPSIZE / 2;
+}
+
+__cold int mdbx_env_get_maxkeysize(const MDBX_env *env) {
+  return mdbx_env_get_maxkeysize_ex(env, MDBX_DUPSORT);
+}
+
+__cold int mdbx_env_get_maxkeysize_ex(const MDBX_env *env,
+                                      MDBX_db_flags_t flags) {
+  if (unlikely(!env || env->me_signature.weak != MDBX_ME_SIGNATURE))
+    return -1;
+
+  return (int)mdbx_limits_keysize_max((intptr_t)env->me_psize, flags);
+}
+
+size_t mdbx_default_pagesize(void) {
+  size_t pagesize = mdbx_syspagesize();
+  mdbx_ensure(nullptr, is_powerof2(pagesize));
+  pagesize = (pagesize >= MIN_PAGESIZE) ? pagesize : MIN_PAGESIZE;
+  pagesize = (pagesize <= MAX_PAGESIZE) ?
pagesize : MAX_PAGESIZE; + return pagesize; +} + +__cold intptr_t mdbx_limits_keysize_max(intptr_t pagesize, + MDBX_db_flags_t flags) { + if (pagesize < 1) + pagesize = (intptr_t)mdbx_default_pagesize(); + if (unlikely(pagesize < (intptr_t)MIN_PAGESIZE || + pagesize > (intptr_t)MAX_PAGESIZE || + !is_powerof2((size_t)pagesize))) + return -1; + + return keysize_max(pagesize, flags); +} + +__cold int mdbx_env_get_maxvalsize_ex(const MDBX_env *env, + MDBX_db_flags_t flags) { + if (unlikely(!env || env->me_signature.weak != MDBX_ME_SIGNATURE)) + return -1; + + return (int)mdbx_limits_valsize_max((intptr_t)env->me_psize, flags); +} + +__cold intptr_t mdbx_limits_valsize_max(intptr_t pagesize, + MDBX_db_flags_t flags) { + if (pagesize < 1) + pagesize = (intptr_t)mdbx_default_pagesize(); + if (unlikely(pagesize < (intptr_t)MIN_PAGESIZE || + pagesize > (intptr_t)MAX_PAGESIZE || + !is_powerof2((size_t)pagesize))) + return -1; + + return valsize_max(pagesize, flags); +} + +/* Calculate the size of a leaf node. + * + * The size depends on the environment's page size; if a data item + * is too large it will be put onto an overflow page and the node + * size will only include the key and not the data. Sizes are always + * rounded up to an even number of bytes, to guarantee 2-byte alignment + * of the MDBX_node headers. */ +MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t +leaf_size(const MDBX_env *env, const MDBX_val *key, const MDBX_val *data) { + size_t node_bytes = node_size(key, data); + if (node_bytes > env->me_leaf_nodemax) { + /* put on overflow page */ + node_bytes = node_size_len(key->iov_len, 0) + sizeof(pgno_t); + } + + return node_bytes + sizeof(indx_t); +} + +/* Calculate the size of a branch node. + * + * The size should depend on the environment's page size but since + * we currently don't support spilling large keys onto overflow + * pages, it's simply the size of the MDBX_node header plus the + * size of the key. Sizes are always rounded up to an even number + * of bytes, to guarantee 2-byte alignment of the MDBX_node headers. + * + * [in] env The environment handle. + * [in] key The key for the node. + * + * Returns The number of bytes needed to store the node. */ +MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t +branch_size(const MDBX_env *env, const MDBX_val *key) { + /* Size of a node in a branch page with a given key. + * This is just the node header plus the key, there is no data. 
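+ * For instance (an illustrative calculation, not from the original
+ * comment): with NODESIZE == 8, a 10-byte key needs 8 + EVEN(10) = 18
+ * bytes for the node plus sizeof(indx_t) == 2 for the page-index slot,
+ * i.e. 20 bytes in total.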
*/
+  size_t node_bytes = node_size(key, nullptr);
+  if (unlikely(node_bytes > env->me_leaf_nodemax)) {
+    /* put on overflow page */
+    /* not implemented */
+    mdbx_assert_fail(env, "INDXSIZE(key) <= env->me_nodemax", __func__,
+                     __LINE__);
+    node_bytes = node_size(key, nullptr) + sizeof(pgno_t);
+  }
+
+  return node_bytes + sizeof(indx_t);
+}
+
+MDBX_NOTHROW_CONST_FUNCTION static __always_inline uint16_t
+flags_db2sub(uint16_t db_flags) {
+  uint16_t sub_flags = db_flags & MDBX_DUPFIXED;
+
+  /* MDBX_INTEGERDUP => MDBX_INTEGERKEY */
+#define SHIFT_INTEGERDUP_TO_INTEGERKEY 2
+  STATIC_ASSERT((MDBX_INTEGERDUP >> SHIFT_INTEGERDUP_TO_INTEGERKEY) ==
+                MDBX_INTEGERKEY);
+  sub_flags |= (db_flags & MDBX_INTEGERDUP) >> SHIFT_INTEGERDUP_TO_INTEGERKEY;
+
+  /* MDBX_REVERSEDUP => MDBX_REVERSEKEY */
+#define SHIFT_REVERSEDUP_TO_REVERSEKEY 5
+  STATIC_ASSERT((MDBX_REVERSEDUP >> SHIFT_REVERSEDUP_TO_REVERSEKEY) ==
+                MDBX_REVERSEKEY);
+  sub_flags |= (db_flags & MDBX_REVERSEDUP) >> SHIFT_REVERSEDUP_TO_REVERSEKEY;
+
+  return sub_flags;
+}
+
+/*----------------------------------------------------------------------------*/
+
+MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t
+pgno2bytes(const MDBX_env *env, pgno_t pgno) {
+  mdbx_assert(env, (1u << env->me_psize2log) == env->me_psize);
+  return ((size_t)pgno) << env->me_psize2log;
+}
+
+MDBX_NOTHROW_PURE_FUNCTION static __always_inline MDBX_page *
+pgno2page(const MDBX_env *env, pgno_t pgno) {
+  return (MDBX_page *)(env->me_map + pgno2bytes(env, pgno));
+}
+
+MDBX_NOTHROW_PURE_FUNCTION static __always_inline pgno_t
+bytes2pgno(const MDBX_env *env, size_t bytes) {
+  mdbx_assert(env, (env->me_psize >> env->me_psize2log) == 1);
+  return (pgno_t)(bytes >> env->me_psize2log);
+}
+
+MDBX_NOTHROW_PURE_FUNCTION static size_t
+pgno_align2os_bytes(const MDBX_env *env, pgno_t pgno) {
+  return ceil_powerof2(pgno2bytes(env, pgno), env->me_os_psize);
+}
+
+MDBX_NOTHROW_PURE_FUNCTION static pgno_t pgno_align2os_pgno(const MDBX_env *env,
+                                                            pgno_t pgno) {
+  return bytes2pgno(env, pgno_align2os_bytes(env, pgno));
+}
+
+MDBX_NOTHROW_PURE_FUNCTION static size_t
+bytes_align2os_bytes(const MDBX_env *env, size_t bytes) {
+  return ceil_powerof2(ceil_powerof2(bytes, env->me_psize), env->me_os_psize);
+}
+
+/* Address of first usable data byte in a page, after the header */
+MDBX_NOTHROW_PURE_FUNCTION static __always_inline void *
+page_data(const MDBX_page *mp) {
+  return (char *)mp + PAGEHDRSZ;
+}
+
+MDBX_NOTHROW_PURE_FUNCTION static __always_inline const MDBX_page *
+data_page(const void *data) {
+  return container_of(data, MDBX_page, mp_ptrs);
+}
+
+MDBX_NOTHROW_PURE_FUNCTION static __always_inline MDBX_meta *
+page_meta(MDBX_page *mp) {
+  return (MDBX_meta *)page_data(mp);
+}
+
+/* Number of nodes on a page */
+MDBX_NOTHROW_PURE_FUNCTION static __always_inline unsigned
+page_numkeys(const MDBX_page *mp) {
+  return mp->mp_lower >> 1;
+}
+
+/* The amount of space remaining in the page */
+MDBX_NOTHROW_PURE_FUNCTION static __always_inline unsigned
+page_room(const MDBX_page *mp) {
+  return mp->mp_upper - mp->mp_lower;
+}
+
+MDBX_NOTHROW_PURE_FUNCTION static __always_inline unsigned
+page_space(const MDBX_env *env) {
+  STATIC_ASSERT(PAGEHDRSZ % 2 == 0);
+  return env->me_psize - PAGEHDRSZ;
+}
+
+MDBX_NOTHROW_PURE_FUNCTION static __always_inline unsigned
+page_used(const MDBX_env *env, const MDBX_page *mp) {
+  return page_space(env) - page_room(mp);
+}
+
+/* The percentage of space used in the page. */
+MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static __inline double
+page_fill(const MDBX_env *env, const MDBX_page *mp) {
+  return page_used(env, mp) * 100.0 / page_space(env);
+}
+
+/* The number of overflow pages needed to store the given size. */
+MDBX_NOTHROW_PURE_FUNCTION static __always_inline pgno_t
+number_of_ovpages(const MDBX_env *env, size_t bytes) {
+  return bytes2pgno(env, PAGEHDRSZ - 1 + bytes) + 1;
+}
+
+__cold static int MDBX_PRINTF_ARGS(2, 3)
+    bad_page(const MDBX_page *mp, const char *fmt, ...) {
+  if (mdbx_log_enabled(MDBX_LOG_ERROR)) {
+    static const MDBX_page *prev;
+    if (prev != mp) {
+      prev = mp;
+      const char *type;
+      switch (mp->mp_flags & (P_BRANCH | P_LEAF | P_OVERFLOW | P_META |
+                              P_LEAF2 | P_BAD | P_SUBP)) {
+      case P_BRANCH:
+        type = "branch";
+        break;
+      case P_LEAF:
+        type = "leaf";
+        break;
+      case P_LEAF | P_SUBP:
+        type = "subleaf";
+        break;
+      case P_LEAF | P_LEAF2:
+        type = "dupfixed-leaf";
+        break;
+      case P_LEAF | P_LEAF2 | P_SUBP:
+        type = "dupfixed-subleaf";
+        break;
+      case P_OVERFLOW:
+        type = "large";
+        break;
+      default:
+        type = "broken";
+      }
+      mdbx_debug_log(MDBX_LOG_ERROR, "badpage", 0,
+                     "corrupted %s-page #%u, mod-txnid %" PRIaTXN "\n", type,
+                     mp->mp_pgno, mp->mp_txnid);
+    }
+
+    va_list args;
+    va_start(args, fmt);
+    mdbx_debug_log_va(MDBX_LOG_ERROR, "badpage", 0, fmt, args);
+    va_end(args);
+  }
+  return MDBX_CORRUPTED;
+}
+
+/* Address of node i in page p */
+MDBX_NOTHROW_PURE_FUNCTION static __always_inline MDBX_node *
+page_node(const MDBX_page *mp, unsigned i) {
+  assert((mp->mp_flags & (P_LEAF2 | P_OVERFLOW | P_META)) == 0);
+  assert(page_numkeys(mp) > (unsigned)(i));
+  assert(mp->mp_ptrs[i] % 2 == 0);
+  return (MDBX_node *)((char *)mp + mp->mp_ptrs[i] + PAGEHDRSZ);
+}
+
+/* The address of a key in a LEAF2 page.
+ * LEAF2 pages are used for MDBX_DUPFIXED sorted-duplicate sub-DBs.
+ * There are no node headers, keys are stored contiguously. */
+MDBX_NOTHROW_PURE_FUNCTION static __always_inline void *
+page_leaf2key(const MDBX_page *mp, unsigned i, size_t keysize) {
+  assert((mp->mp_flags & (P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW | P_META)) ==
+         (P_LEAF | P_LEAF2));
+  assert(mp->mp_leaf2_ksize == keysize);
+  (void)keysize;
+  return (char *)mp + PAGEHDRSZ + (i * mp->mp_leaf2_ksize);
+}
+
+/* Set the node's key into keyptr. */
+static __always_inline void get_key(const MDBX_node *node, MDBX_val *keyptr) {
+  keyptr->iov_len = node_ks(node);
+  keyptr->iov_base = node_key(node);
+}
+
+/* Set the node's key into keyptr, if requested. */
+static __always_inline void
+get_key_optional(const MDBX_node *node, MDBX_val *keyptr /* __may_null */) {
+  if (keyptr)
+    get_key(node, keyptr);
+}
+
+/*------------------------------------------------------------------------------
+ * Workaround for the mmaped-lookahead-cross-page-boundary bug
+ * in obsolete versions of Elbrus's libc and kernels.
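+ * The replacement routines below use plain, strictly in-bounds byte/word
+ * accesses, so comparisons over the memory-mapped file cannot read past
+ * the end of a mapping the way the optimized libc variants could.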
*/ +#if defined(__e2k__) && defined(MDBX_E2K_MLHCPB_WORKAROUND) && \ + MDBX_E2K_MLHCPB_WORKAROUND +int __hot mdbx_e2k_memcmp_bug_workaround(const void *s1, const void *s2, + size_t n) { + if (unlikely(n > 42 + /* LY: align followed access if reasonable possible */ + && (((uintptr_t)s1) & 7) != 0 && + (((uintptr_t)s1) & 7) == (((uintptr_t)s2) & 7))) { + if (((uintptr_t)s1) & 1) { + const int diff = *(uint8_t *)s1 - *(uint8_t *)s2; + if (diff) + return diff; + s1 = (char *)s1 + 1; + s2 = (char *)s2 + 1; + n -= 1; + } + + if (((uintptr_t)s1) & 2) { + const uint16_t a = *(uint16_t *)s1; + const uint16_t b = *(uint16_t *)s2; + if (likely(a != b)) + return (__builtin_bswap16(a) > __builtin_bswap16(b)) ? 1 : -1; + s1 = (char *)s1 + 2; + s2 = (char *)s2 + 2; + n -= 2; + } + + if (((uintptr_t)s1) & 4) { + const uint32_t a = *(uint32_t *)s1; + const uint32_t b = *(uint32_t *)s2; + if (likely(a != b)) + return (__builtin_bswap32(a) > __builtin_bswap32(b)) ? 1 : -1; + s1 = (char *)s1 + 4; + s2 = (char *)s2 + 4; + n -= 4; + } + } + + while (n >= 8) { + const uint64_t a = *(uint64_t *)s1; + const uint64_t b = *(uint64_t *)s2; + if (likely(a != b)) + return (__builtin_bswap64(a) > __builtin_bswap64(b)) ? 1 : -1; + s1 = (char *)s1 + 8; + s2 = (char *)s2 + 8; + n -= 8; + } + + if (n & 4) { + const uint32_t a = *(uint32_t *)s1; + const uint32_t b = *(uint32_t *)s2; + if (likely(a != b)) + return (__builtin_bswap32(a) > __builtin_bswap32(b)) ? 1 : -1; + s1 = (char *)s1 + 4; + s2 = (char *)s2 + 4; + } + + if (n & 2) { + const uint16_t a = *(uint16_t *)s1; + const uint16_t b = *(uint16_t *)s2; + if (likely(a != b)) + return (__builtin_bswap16(a) > __builtin_bswap16(b)) ? 1 : -1; + s1 = (char *)s1 + 2; + s2 = (char *)s2 + 2; + } + + return (n & 1) ? *(uint8_t *)s1 - *(uint8_t *)s2 : 0; +} + +int __hot mdbx_e2k_strcmp_bug_workaround(const char *s1, const char *s2) { + while (true) { + int diff = *(uint8_t *)s1 - *(uint8_t *)s2; + if (likely(diff != 0) || *s1 == '\0') + return diff; + s1 += 1; + s2 += 1; + } +} + +int __hot mdbx_e2k_strncmp_bug_workaround(const char *s1, const char *s2, + size_t n) { + while (n > 0) { + int diff = *(uint8_t *)s1 - *(uint8_t *)s2; + if (likely(diff != 0) || *s1 == '\0') + return diff; + s1 += 1; + s2 += 1; + n -= 1; + } + return 0; +} + +size_t __hot mdbx_e2k_strlen_bug_workaround(const char *s) { + size_t n = 0; + while (*s) { + s += 1; + n += 1; + } + return n; +} + +size_t __hot mdbx_e2k_strnlen_bug_workaround(const char *s, size_t maxlen) { + size_t n = 0; + while (maxlen > n && *s) { + s += 1; + n += 1; + } + return n; +} +#endif /* MDBX_E2K_MLHCPB_WORKAROUND */ + +/*------------------------------------------------------------------------------ + * safe read/write volatile 64-bit fields on 32-bit architectures. 
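+ *
+ * The scheme implemented by the safe64_* helpers below: to change a value,
+ * the high half is first parked at UINT32_MAX (making the whole value >=
+ * SAFE64_INVALID_THRESHOLD, i.e. recognizably invalid), then the low half
+ * is written, then the high half. A concurrent reader may therefore observe
+ * a torn value only in the invalid range, never a plausible mixed txnid.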
*/ + +static __always_inline void atomic_yield(void) { +#if defined(_WIN32) || defined(_WIN64) + YieldProcessor(); +#elif defined(__ia32__) || defined(__e2k__) + __builtin_ia32_pause(); +#elif defined(__ia64__) +#if defined(__HP_cc__) || defined(__HP_aCC__) + _Asm_hint(_HINT_PAUSE); +#else + __asm__ __volatile__("hint @pause"); +#endif +#elif defined(__aarch64__) || (defined(__ARM_ARCH) && __ARM_ARCH > 6) || \ + defined(__ARM_ARCH_6K__) +#ifdef __CC_ARM + __yield(); +#else + __asm__ __volatile__("yield"); +#endif +#elif (defined(__mips64) || defined(__mips64__)) && defined(__mips_isa_rev) && \ + __mips_isa_rev >= 2 + __asm__ __volatile__("pause"); +#elif defined(__mips) || defined(__mips__) || defined(__mips64) || \ + defined(__mips64__) || defined(_M_MRX000) || defined(_MIPS_) || \ + defined(__MWERKS__) || defined(__sgi) + __asm__ __volatile__(".word 0x00000140"); +#elif defined(__linux__) || defined(__gnu_linux__) || defined(_UNIX03_SOURCE) + sched_yield(); +#elif (defined(_GNU_SOURCE) && __GLIBC_PREREQ(2, 1)) || defined(_OPEN_THREADS) + pthread_yield(); +#endif +} + +#if MDBX_64BIT_CAS +static __always_inline bool atomic_cas64(MDBX_atomic_uint64_t *p, uint64_t c, + uint64_t v) { +#ifdef MDBX_HAVE_C11ATOMICS + STATIC_ASSERT(sizeof(long long) >= sizeof(uint64_t)); +#ifdef ATOMIC_LLONG_LOCK_FREE + STATIC_ASSERT(ATOMIC_LLONG_LOCK_FREE > 0); +#if ATOMIC_LLONG_LOCK_FREE < 2 + assert(atomic_is_lock_free(MDBX_c11a_rw(uint64_t, p))); +#endif /* ATOMIC_LLONG_LOCK_FREE < 2 */ +#else /* defined(ATOMIC_LLONG_LOCK_FREE) */ + assert(atomic_is_lock_free(MDBX_c11a_rw(uint64_t, p))); +#endif + return atomic_compare_exchange_strong(MDBX_c11a_rw(uint64_t, p), &c, v); +#elif defined(__GNUC__) || defined(__clang__) + return __sync_bool_compare_and_swap(&p->weak, c, v); +#elif defined(_MSC_VER) + return c == (uint64_t)_InterlockedCompareExchange64( + (volatile __int64 *)&p->weak, v, c); +#elif defined(__APPLE__) + return OSAtomicCompareAndSwap64Barrier(c, v, &p->weak); +#else +#error FIXME: Unsupported compiler +#endif +} +#endif /* MDBX_64BIT_CAS */ + +static __always_inline bool atomic_cas32(MDBX_atomic_uint32_t *p, uint32_t c, + uint32_t v) { +#ifdef MDBX_HAVE_C11ATOMICS + STATIC_ASSERT(sizeof(int) >= sizeof(uint32_t)); +#ifdef ATOMIC_INT_LOCK_FREE + STATIC_ASSERT(ATOMIC_INT_LOCK_FREE > 0); +#if ATOMIC_INT_LOCK_FREE < 2 + assert(atomic_is_lock_free(MDBX_c11a_rw(uint32_t, p))); +#endif +#else + assert(atomic_is_lock_free(MDBX_c11a_rw(uint32_t, p))); +#endif + return atomic_compare_exchange_strong(MDBX_c11a_rw(uint32_t, p), &c, v); +#elif defined(__GNUC__) || defined(__clang__) + return __sync_bool_compare_and_swap(&p->weak, c, v); +#elif defined(_MSC_VER) + STATIC_ASSERT(sizeof(volatile long) == sizeof(volatile uint32_t)); + return c == + (uint32_t)_InterlockedCompareExchange((volatile long *)&p->weak, v, c); +#elif defined(__APPLE__) + return OSAtomicCompareAndSwap32Barrier(c, v, &p->weak); +#else +#error FIXME: Unsupported compiler +#endif +} + +static __always_inline uint32_t atomic_add32(MDBX_atomic_uint32_t *p, + uint32_t v) { +#ifdef MDBX_HAVE_C11ATOMICS + STATIC_ASSERT(sizeof(int) >= sizeof(uint32_t)); +#ifdef ATOMIC_INT_LOCK_FREE + STATIC_ASSERT(ATOMIC_INT_LOCK_FREE > 0); +#if ATOMIC_INT_LOCK_FREE < 2 + assert(atomic_is_lock_free(MDBX_c11a_rw(uint32_t, p))); +#endif +#else + assert(atomic_is_lock_free(MDBX_c11a_rw(uint32_t, p))); +#endif + return atomic_fetch_add(MDBX_c11a_rw(uint32_t, p), v); +#elif defined(__GNUC__) || defined(__clang__) + return __sync_fetch_and_add(&p->weak, v); +#elif 
defined(_MSC_VER)
+  STATIC_ASSERT(sizeof(volatile long) == sizeof(volatile uint32_t));
+  return (uint32_t)_InterlockedExchangeAdd((volatile long *)&p->weak, v);
+#elif defined(__APPLE__)
+  return OSAtomicAdd32Barrier(v, &p->weak);
+#else
+#error FIXME: Unsupported compiler
+#endif
+}
+
+#define atomic_sub32(p, v) atomic_add32(p, 0 - (v))
+
+static __always_inline uint64_t safe64_txnid_next(uint64_t txnid) {
+  txnid += xMDBX_TXNID_STEP;
+#if !MDBX_64BIT_CAS
+  /* avoid overflow of low-part in safe64_reset() */
+  txnid += (UINT32_MAX == (uint32_t)txnid);
+#endif
+  return txnid;
+}
+
+static __always_inline void safe64_reset(MDBX_atomic_uint64_t *p,
+                                         bool single_writer) {
+#if !MDBX_64BIT_CAS
+  if (!single_writer) {
+    STATIC_ASSERT(xMDBX_TXNID_STEP > 1);
+    /* it is safe to increment the low-part to avoid ABA, since
+     * xMDBX_TXNID_STEP > 1 and the overflow was preserved in
+     * safe64_txnid_next() */
+    atomic_add32(&p->low, 1) /* avoid ABA in safe64_reset_compare() */;
+    atomic_store32(
+        &p->high, UINT32_MAX,
+        mo_Relaxed) /* atomically make >= SAFE64_INVALID_THRESHOLD */;
+    atomic_add32(&p->low, 1) /* avoid ABA in safe64_reset_compare() */;
+  } else
+#elif MDBX_64BIT_ATOMIC
+  /* atomically make value >= SAFE64_INVALID_THRESHOLD by 64-bit operation */
+  atomic_store64(p, UINT64_MAX,
+                 single_writer ? mo_AcquireRelease : mo_SequentialConsistency);
+#else
+  /* atomically make value >= SAFE64_INVALID_THRESHOLD by 32-bit operation */
+  atomic_store32(&p->high, UINT32_MAX,
+                 single_writer ? mo_AcquireRelease : mo_SequentialConsistency);
+#endif /* MDBX_64BIT_ATOMIC */
+  assert(p->weak >= SAFE64_INVALID_THRESHOLD);
+  mdbx_jitter4testing(true);
+}
+
+static __always_inline bool safe64_reset_compare(MDBX_atomic_uint64_t *p,
+                                                 txnid_t compare) {
+  /* LY: This function is used to reset `mr_txnid` from the hsr-handler in
+   * case of asynchronous cancellation of a read transaction. Therefore,
+   * there may be a collision between the cleanup performed here and the
+   * asynchronous termination and restarting of the read transaction
+   * in another process/thread. In general we MUST NOT reset `mr_txnid`
+   * if a new transaction was started (i.e. if `mr_txnid` was changed). */
+#if MDBX_64BIT_CAS
+  bool rc = atomic_cas64(p, compare, UINT64_MAX);
+#else
+  /* LY: There is no golden ratio here, since a shared mutex would be too
+   * costly: we would have to acquire/release it for every update of
+   * mr_txnid, i.e. twice for each read transaction.
*/ + bool rc = false; + if (likely(atomic_load32(&p->low, mo_AcquireRelease) == (uint32_t)compare && + atomic_cas32(&p->high, (uint32_t)(compare >> 32), UINT32_MAX))) { + if (unlikely(atomic_load32(&p->low, mo_AcquireRelease) != + (uint32_t)compare)) + atomic_cas32(&p->high, UINT32_MAX, (uint32_t)(compare >> 32)); + else + rc = true; + } +#endif /* MDBX_64BIT_CAS */ + mdbx_jitter4testing(true); + return rc; +} + +static __always_inline void safe64_write(MDBX_atomic_uint64_t *p, + const uint64_t v) { + assert(p->weak >= SAFE64_INVALID_THRESHOLD); +#if MDBX_64BIT_ATOMIC + atomic_store64(p, v, mo_AcquireRelease); +#else /* MDBX_64BIT_ATOMIC */ + mdbx_compiler_barrier(); + /* update low-part but still value >= SAFE64_INVALID_THRESHOLD */ + atomic_store32(&p->low, (uint32_t)v, mo_Relaxed); + assert(p->weak >= SAFE64_INVALID_THRESHOLD); + mdbx_jitter4testing(true); + /* update high-part from SAFE64_INVALID_THRESHOLD to actual value */ + atomic_store32(&p->high, (uint32_t)(v >> 32), mo_AcquireRelease); +#endif /* MDBX_64BIT_ATOMIC */ + assert(p->weak == v); + mdbx_jitter4testing(true); +} + +static __always_inline uint64_t safe64_read(const MDBX_atomic_uint64_t *p) { + mdbx_jitter4testing(true); + uint64_t v = atomic_load64(p, mo_AcquireRelease); + mdbx_jitter4testing(true); + return v; +} + +#if 0 /* unused for now */ +MDBX_MAYBE_UNUSED static __always_inline bool safe64_is_valid(uint64_t v) { +#if MDBX_WORDBITS >= 64 + return v < SAFE64_INVALID_THRESHOLD; +#else + return (v >> 32) != UINT32_MAX; +#endif /* MDBX_WORDBITS */ +} + +MDBX_MAYBE_UNUSED static __always_inline bool + safe64_is_valid_ptr(const MDBX_atomic_uint64_t *p) { +#if MDBX_64BIT_ATOMIC + return atomic_load64(p, mo_AcquireRelease) < SAFE64_INVALID_THRESHOLD; +#else + return atomic_load32(&p->high, mo_AcquireRelease) != UINT32_MAX; +#endif /* MDBX_64BIT_ATOMIC */ +} +#endif /* unused for now */ + +/* non-atomic write with safety for reading a half-updated value */ +static __always_inline void safe64_update(MDBX_atomic_uint64_t *p, + const uint64_t v) { +#if MDBX_64BIT_ATOMIC + atomic_store64(p, v, mo_Relaxed); +#else + safe64_reset(p, true); + safe64_write(p, v); +#endif /* MDBX_64BIT_ATOMIC */ +} + +/* non-atomic increment with safety for reading a half-updated value */ +MDBX_MAYBE_UNUSED static +#if MDBX_64BIT_ATOMIC + __always_inline +#endif /* MDBX_64BIT_ATOMIC */ + void + safe64_inc(MDBX_atomic_uint64_t *p, const uint64_t v) { + assert(v > 0); + safe64_update(p, atomic_load64(p, mo_Relaxed) + v); +} + +/*----------------------------------------------------------------------------*/ +/* rthc (tls keys and destructors) */ + +typedef struct rthc_entry_t { + MDBX_reader *begin; + MDBX_reader *end; + mdbx_thread_key_t thr_tls_key; + bool key_valid; +} rthc_entry_t; + +#if MDBX_DEBUG +#define RTHC_INITIAL_LIMIT 1 +#else +#define RTHC_INITIAL_LIMIT 16 +#endif + +static bin128_t bootid; + +#if defined(_WIN32) || defined(_WIN64) +static CRITICAL_SECTION rthc_critical_section; +static CRITICAL_SECTION lcklist_critical_section; +#else +int __cxa_thread_atexit_impl(void (*dtor)(void *), void *obj, void *dso_symbol) + __attribute__((__weak__)); +#ifdef __APPLE__ /* FIXME: Thread-Local Storage destructors & DSO-unloading */ +int __cxa_thread_atexit_impl(void (*dtor)(void *), void *obj, + void *dso_symbol) { + (void)dtor; + (void)obj; + (void)dso_symbol; + return -1; +} +#endif /* __APPLE__ */ + +static pthread_mutex_t lcklist_mutex = PTHREAD_MUTEX_INITIALIZER; +static pthread_mutex_t rthc_mutex = PTHREAD_MUTEX_INITIALIZER; +static 
pthread_cond_t rthc_cond = PTHREAD_COND_INITIALIZER;
+static mdbx_thread_key_t rthc_key;
+static MDBX_atomic_uint32_t rthc_pending;
+
+static void __cold workaround_glibc_bug21031(void) {
+  /* Workaround for https://sourceware.org/bugzilla/show_bug.cgi?id=21031
+   *
+   * Due to a race between pthread_key_delete() and __nptl_deallocate_tsd(),
+   * the destructor(s) of thread-local-storage object(s) may still be running
+   * in other thread(s) and be blocked or not finished yet.
+   * In such a case we get a SEGFAULT after unloading this library's DSO.
+   *
+   * So just by yielding a few timeslices we give such destructor(s)
+   * a chance to complete and avoid the segfault. */
+  sched_yield();
+  sched_yield();
+  sched_yield();
+}
+#endif
+
+static unsigned rthc_count, rthc_limit;
+static rthc_entry_t *rthc_table;
+static rthc_entry_t rthc_table_static[RTHC_INITIAL_LIMIT];
+
+static __inline void rthc_lock(void) {
+#if defined(_WIN32) || defined(_WIN64)
+  EnterCriticalSection(&rthc_critical_section);
+#else
+  mdbx_ensure(nullptr, pthread_mutex_lock(&rthc_mutex) == 0);
+#endif
+}
+
+static __inline void rthc_unlock(void) {
+#if defined(_WIN32) || defined(_WIN64)
+  LeaveCriticalSection(&rthc_critical_section);
+#else
+  mdbx_ensure(nullptr, pthread_mutex_unlock(&rthc_mutex) == 0);
+#endif
+}
+
+static __inline int thread_key_create(mdbx_thread_key_t *key) {
+  int rc;
+#if defined(_WIN32) || defined(_WIN64)
+  *key = TlsAlloc();
+  rc = (*key != TLS_OUT_OF_INDEXES) ? MDBX_SUCCESS : GetLastError();
+#else
+  rc = pthread_key_create(key, nullptr);
+#endif
+  mdbx_trace("&key = %p, value %" PRIuPTR ", rc %d",
+             __Wpedantic_format_voidptr(key), (uintptr_t)*key, rc);
+  return rc;
+}
+
+static __inline void thread_key_delete(mdbx_thread_key_t key) {
+  mdbx_trace("key = %" PRIuPTR, (uintptr_t)key);
+#if defined(_WIN32) || defined(_WIN64)
+  mdbx_ensure(nullptr, TlsFree(key));
+#else
+  mdbx_ensure(nullptr, pthread_key_delete(key) == 0);
+  workaround_glibc_bug21031();
+#endif
+}
+
+static __inline void *thread_rthc_get(mdbx_thread_key_t key) {
+#if defined(_WIN32) || defined(_WIN64)
+  return TlsGetValue(key);
+#else
+  return pthread_getspecific(key);
+#endif
+}
+
+static void thread_rthc_set(mdbx_thread_key_t key, const void *value) {
+#if defined(_WIN32) || defined(_WIN64)
+  mdbx_ensure(nullptr, TlsSetValue(key, (void *)value));
+#else
+#define MDBX_THREAD_RTHC_ZERO 0
+#define MDBX_THREAD_RTHC_REGISTERED 1
+#define MDBX_THREAD_RTHC_COUNTED 2
+  static __thread uint32_t thread_registration_state;
+  if (value && unlikely(thread_registration_state == MDBX_THREAD_RTHC_ZERO)) {
+    thread_registration_state = MDBX_THREAD_RTHC_REGISTERED;
+    mdbx_trace("thread registered 0x%" PRIxPTR, mdbx_thread_self());
+    if (&__cxa_thread_atexit_impl == nullptr ||
+        __cxa_thread_atexit_impl(mdbx_rthc_thread_dtor,
+                                 &thread_registration_state,
+                                 (void *)&mdbx_version /* dso_anchor */)) {
+      mdbx_ensure(nullptr, pthread_setspecific(
+                               rthc_key, &thread_registration_state) == 0);
+      thread_registration_state = MDBX_THREAD_RTHC_COUNTED;
+      const unsigned count_before = atomic_add32(&rthc_pending, 1);
+      mdbx_ensure(nullptr, count_before < INT_MAX);
+      mdbx_trace("fallback to pthreads' tsd, key %" PRIuPTR ", count %u",
+                 (uintptr_t)rthc_key, count_before);
+      (void)count_before;
+    }
+  }
+  mdbx_ensure(nullptr, pthread_setspecific(key, value) == 0);
+#endif
+}
+
+__cold void mdbx_rthc_global_init(void) {
+  rthc_limit = RTHC_INITIAL_LIMIT;
+  rthc_table = rthc_table_static;
+#if defined(_WIN32) || defined(_WIN64)
+  InitializeCriticalSection(&rthc_critical_section);
+  InitializeCriticalSection(&lcklist_critical_section);
+#else
+  mdbx_ensure(nullptr,
+              pthread_key_create(&rthc_key, mdbx_rthc_thread_dtor) == 0);
+  mdbx_trace("pid %d, &mdbx_rthc_key = %p, value 0x%x", mdbx_getpid(),
+             __Wpedantic_format_voidptr(&rthc_key), (unsigned)rthc_key);
+#endif
+  /* check time conversion; this also avoids racing on 32-bit architectures
+   * while writing the calculated 64-bit ratio(s) into memory. */
+  uint32_t proba = UINT32_MAX;
+  while (true) {
+    unsigned time_conversion_checkup =
+        mdbx_osal_monotime_to_16dot16(mdbx_osal_16dot16_to_monotime(proba));
+    unsigned one_more = (proba < UINT32_MAX) ? proba + 1 : proba;
+    unsigned one_less = (proba > 0) ? proba - 1 : proba;
+    mdbx_ensure(nullptr, time_conversion_checkup >= one_less &&
+                             time_conversion_checkup <= one_more);
+    if (proba == 0)
+      break;
+    proba >>= 1;
+  }
+
+  bootid = mdbx_osal_bootid();
+#if 0 /* debug */
+  for (unsigned i = 0; i < 65536; ++i) {
+    size_t pages = pv2pages(i);
+    unsigned x = pages2pv(pages);
+    size_t xp = pv2pages(x);
+    if (!(x == i || (x % 2 == 0 && x < 65536)) || pages != xp)
+      printf("%u => %zu => %u => %zu\n", i, pages, x, xp);
+    assert(pages == xp);
+  }
+  fflush(stdout);
+#endif
+}
+
+/* dtor called for a thread, i.e. for all mdbx's environment objects */
+__cold void mdbx_rthc_thread_dtor(void *ptr) {
+  rthc_lock();
+  mdbx_trace(">> pid %d, thread 0x%" PRIxPTR ", rthc %p", mdbx_getpid(),
+             mdbx_thread_self(), ptr);
+
+  const uint32_t self_pid = mdbx_getpid();
+  for (unsigned i = 0; i < rthc_count; ++i) {
+    if (!rthc_table[i].key_valid)
+      continue;
+    const mdbx_thread_key_t key = rthc_table[i].thr_tls_key;
+    MDBX_reader *const rthc = thread_rthc_get(key);
+    if (rthc < rthc_table[i].begin || rthc >= rthc_table[i].end)
+      continue;
+#if !defined(_WIN32) && !defined(_WIN64)
+    if (pthread_setspecific(key, nullptr) != 0) {
+      mdbx_trace("== thread 0x%" PRIxPTR
+                 ", rthc %p: ignore race with tsd-key deletion",
+                 mdbx_thread_self(), ptr);
+      continue /* ignore race with tsd-key deletion by mdbx_env_close() */;
+    }
+#endif
+
+    mdbx_trace("== thread 0x%" PRIxPTR
+               ", rthc %p, [%i], %p ... %p (%+i), rthc-pid %i, "
+               "current-pid %i",
+               mdbx_thread_self(), __Wpedantic_format_voidptr(rthc), i,
+               __Wpedantic_format_voidptr(rthc_table[i].begin),
+               __Wpedantic_format_voidptr(rthc_table[i].end),
+               (int)(rthc - rthc_table[i].begin), rthc->mr_pid.weak, self_pid);
+    if (atomic_load32(&rthc->mr_pid, mo_Relaxed) == self_pid) {
+      mdbx_trace("==== thread 0x%" PRIxPTR ", rthc %p, cleanup",
+                 mdbx_thread_self(), __Wpedantic_format_voidptr(rthc));
+      atomic_store32(&rthc->mr_pid, 0, mo_AcquireRelease);
+    }
+  }
+
+#if defined(_WIN32) || defined(_WIN64)
+  mdbx_trace("<< thread 0x%" PRIxPTR ", rthc %p", mdbx_thread_self(), ptr);
+  rthc_unlock();
+#else
+  const char self_registration = *(char *)ptr;
+  *(char *)ptr = MDBX_THREAD_RTHC_ZERO;
+  mdbx_trace("== thread 0x%" PRIxPTR ", rthc %p, pid %d, self-status %d",
+             mdbx_thread_self(), ptr, mdbx_getpid(), self_registration);
+  if (self_registration == MDBX_THREAD_RTHC_COUNTED)
+    mdbx_ensure(nullptr, atomic_sub32(&rthc_pending, 1) > 0);
+
+  if (atomic_load32(&rthc_pending, mo_AcquireRelease) == 0) {
+    mdbx_trace("== thread 0x%" PRIxPTR ", rthc %p, pid %d, wake",
+               mdbx_thread_self(), ptr, mdbx_getpid());
+    mdbx_ensure(nullptr, pthread_cond_broadcast(&rthc_cond) == 0);
+  }
+
+  mdbx_trace("<< thread 0x%" PRIxPTR ", rthc %p", mdbx_thread_self(), ptr);
+  /* Allow tail-call optimization, i.e. gcc should generate a jmp instruction
+   * instead of a call for pthread_mutex_unlock(), so that the CPU cannot
+   * return to the current DSO's code section, which may be unloaded
+   * immediately after the mutex is released. */
+  pthread_mutex_unlock(&rthc_mutex);
+#endif
+}
+
+__cold void mdbx_rthc_global_dtor(void) {
+  mdbx_trace(">> pid %d", mdbx_getpid());
+
+  rthc_lock();
+#if !defined(_WIN32) && !defined(_WIN64)
+  char *rthc = (char *)pthread_getspecific(rthc_key);
+  mdbx_trace("== thread 0x%" PRIxPTR ", rthc %p, pid %d, self-status %d",
+             mdbx_thread_self(), __Wpedantic_format_voidptr(rthc),
+             mdbx_getpid(), rthc ? *rthc : -1);
+  if (rthc) {
+    const char self_registration = *(char *)rthc;
+    *rthc = MDBX_THREAD_RTHC_ZERO;
+    if (self_registration == MDBX_THREAD_RTHC_COUNTED)
+      mdbx_ensure(nullptr, atomic_sub32(&rthc_pending, 1) > 0);
+  }
+
+  struct timespec abstime;
+  mdbx_ensure(nullptr, clock_gettime(CLOCK_REALTIME, &abstime) == 0);
+  abstime.tv_nsec += 1000000000l / 10;
+  if (abstime.tv_nsec >= 1000000000l) {
+    abstime.tv_nsec -= 1000000000l;
+    abstime.tv_sec += 1;
+  }
+#if MDBX_DEBUG > 0
+  abstime.tv_sec += 600;
+#endif
+
+  for (unsigned left;
+       (left = atomic_load32(&rthc_pending, mo_AcquireRelease)) > 0;) {
+    mdbx_trace("pid %d, pending %u, wait for...", mdbx_getpid(), left);
+    const int rc = pthread_cond_timedwait(&rthc_cond, &rthc_mutex, &abstime);
+    if (rc && rc != EINTR)
+      break;
+  }
+  thread_key_delete(rthc_key);
+#endif
+
+  const uint32_t self_pid = mdbx_getpid();
+  for (unsigned i = 0; i < rthc_count; ++i) {
+    if (!rthc_table[i].key_valid)
+      continue;
+    const mdbx_thread_key_t key = rthc_table[i].thr_tls_key;
+    thread_key_delete(key);
+    for (MDBX_reader *rthc = rthc_table[i].begin; rthc < rthc_table[i].end;
+         ++rthc) {
+      mdbx_trace(
+          "== [%i] = key %" PRIuPTR ", %p ... %p, rthc %p (%+i), "
+          "rthc-pid %i, current-pid %i",
+          i, (uintptr_t)key, __Wpedantic_format_voidptr(rthc_table[i].begin),
+          __Wpedantic_format_voidptr(rthc_table[i].end),
+          __Wpedantic_format_voidptr(rthc), (int)(rthc - rthc_table[i].begin),
+          rthc->mr_pid.weak, self_pid);
+      if (atomic_load32(&rthc->mr_pid, mo_Relaxed) == self_pid) {
+        atomic_store32(&rthc->mr_pid, 0, mo_AcquireRelease);
+        mdbx_trace("== cleanup %p", __Wpedantic_format_voidptr(rthc));
+      }
+    }
+  }
+
+  rthc_limit = rthc_count = 0;
+  if (rthc_table != rthc_table_static)
+    mdbx_free(rthc_table);
+  rthc_table = nullptr;
+  rthc_unlock();
+
+#if defined(_WIN32) || defined(_WIN64)
+  DeleteCriticalSection(&lcklist_critical_section);
+  DeleteCriticalSection(&rthc_critical_section);
+#else
+  /* LY: yielding a few timeslices to give racing destructor(s)
+   * a better chance to complete. */
+  workaround_glibc_bug21031();
+#endif
+
+  mdbx_trace("<< pid %d\n", mdbx_getpid());
+}
+
+__cold int mdbx_rthc_alloc(mdbx_thread_key_t *key, MDBX_reader *begin,
+                           MDBX_reader *end) {
+  int rc;
+  if (key) {
+#ifndef NDEBUG
+    *key = (mdbx_thread_key_t)0xBADBADBAD;
+#endif /* NDEBUG */
+    rc = thread_key_create(key);
+    if (rc != MDBX_SUCCESS)
+      return rc;
+  }
+
+  rthc_lock();
+  const mdbx_thread_key_t new_key = key ? *key : 0;
+  mdbx_trace(">> key %" PRIuPTR ", rthc_count %u, rthc_limit %u",
+             (uintptr_t)new_key, rthc_count, rthc_limit);
+  if (rthc_count == rthc_limit) {
+    rthc_entry_t *new_table =
+        mdbx_realloc((rthc_table == rthc_table_static) ?
nullptr : rthc_table, + sizeof(rthc_entry_t) * rthc_limit * 2); + if (new_table == nullptr) { + rc = MDBX_ENOMEM; + goto bailout; + } + if (rthc_table == rthc_table_static) + memcpy(new_table, rthc_table_static, sizeof(rthc_table_static)); + rthc_table = new_table; + rthc_limit *= 2; + } + mdbx_trace("== [%i] = key %" PRIuPTR ", %p ... %p", rthc_count, + (uintptr_t)new_key, __Wpedantic_format_voidptr(begin), + __Wpedantic_format_voidptr(end)); + rthc_table[rthc_count].key_valid = key ? true : false; + rthc_table[rthc_count].thr_tls_key = key ? new_key : 0; + rthc_table[rthc_count].begin = begin; + rthc_table[rthc_count].end = end; + ++rthc_count; + mdbx_trace("<< key %" PRIuPTR ", rthc_count %u, rthc_limit %u", + (uintptr_t)new_key, rthc_count, rthc_limit); + rthc_unlock(); + return MDBX_SUCCESS; + +bailout: + if (key) + thread_key_delete(*key); + rthc_unlock(); + return rc; +} + +__cold void mdbx_rthc_remove(const mdbx_thread_key_t key) { + thread_key_delete(key); + rthc_lock(); + mdbx_trace(">> key %zu, rthc_count %u, rthc_limit %u", (size_t)key, + rthc_count, rthc_limit); + + for (unsigned i = 0; i < rthc_count; ++i) { + if (rthc_table[i].key_valid && key == rthc_table[i].thr_tls_key) { + const uint32_t self_pid = mdbx_getpid(); + mdbx_trace("== [%i], %p ...%p, current-pid %d", i, + __Wpedantic_format_voidptr(rthc_table[i].begin), + __Wpedantic_format_voidptr(rthc_table[i].end), self_pid); + + for (MDBX_reader *rthc = rthc_table[i].begin; rthc < rthc_table[i].end; + ++rthc) { + if (atomic_load32(&rthc->mr_pid, mo_Relaxed) == self_pid) { + atomic_store32(&rthc->mr_pid, 0, mo_AcquireRelease); + mdbx_trace("== cleanup %p", __Wpedantic_format_voidptr(rthc)); + } + } + if (--rthc_count > 0) + rthc_table[i] = rthc_table[rthc_count]; + else if (rthc_table != rthc_table_static) { + mdbx_free(rthc_table); + rthc_table = rthc_table_static; + rthc_limit = RTHC_INITIAL_LIMIT; + } + break; + } + } + + mdbx_trace("<< key %zu, rthc_count %u, rthc_limit %u", (size_t)key, + rthc_count, rthc_limit); + rthc_unlock(); +} + +//------------------------------------------------------------------------------ + +#define RTHC_ENVLIST_END ((MDBX_env *)((uintptr_t)50459)) +static MDBX_env *inprocess_lcklist_head = RTHC_ENVLIST_END; + +static __inline void lcklist_lock(void) { +#if defined(_WIN32) || defined(_WIN64) + EnterCriticalSection(&lcklist_critical_section); +#else + mdbx_ensure(nullptr, pthread_mutex_lock(&lcklist_mutex) == 0); +#endif +} + +static __inline void lcklist_unlock(void) { +#if defined(_WIN32) || defined(_WIN64) + LeaveCriticalSection(&lcklist_critical_section); +#else + mdbx_ensure(nullptr, pthread_mutex_unlock(&lcklist_mutex) == 0); +#endif +} + +MDBX_NOTHROW_CONST_FUNCTION static uint64_t rrxmrrxmsx_0(uint64_t v) { + /* Pelle Evensen's mixer, https://bit.ly/2HOfynt */ + v ^= (v << 39 | v >> 25) ^ (v << 14 | v >> 50); + v *= UINT64_C(0xA24BAED4963EE407); + v ^= (v << 40 | v >> 24) ^ (v << 15 | v >> 49); + v *= UINT64_C(0x9FB21C651E98DF25); + return v ^ v >> 28; +} + +static int uniq_peek(const mdbx_mmap_t *pending, mdbx_mmap_t *scan) { + int rc; + uint64_t bait; + MDBX_lockinfo *const pending_lck = pending->lck; + MDBX_lockinfo *const scan_lck = scan->lck; + if (pending_lck) { + bait = atomic_load64(&pending_lck->mti_bait_uniqueness, mo_AcquireRelease); + rc = MDBX_SUCCESS; + } else { + bait = 0 /* hush MSVC warning */; +#if MDBX_ENABLE_PGOP_STAT + safe64_inc(&scan_lck->mti_pgop_stat.wops, 1); +#endif /* MDBX_ENABLE_PGOP_STAT */ + rc = mdbx_msync(scan, 0, sizeof(MDBX_lockinfo), 
MDBX_SYNC_DATA); + if (rc == MDBX_SUCCESS) + rc = mdbx_pread(pending->fd, &bait, sizeof(scan_lck->mti_bait_uniqueness), + offsetof(MDBX_lockinfo, mti_bait_uniqueness)); + } + if (likely(rc == MDBX_SUCCESS) && + bait == atomic_load64(&scan_lck->mti_bait_uniqueness, mo_AcquireRelease)) + rc = MDBX_RESULT_TRUE; + + mdbx_trace("uniq-peek: %s, bait 0x%016" PRIx64 ",%s rc %d", + pending_lck ? "mem" : "file", bait, + (rc == MDBX_RESULT_TRUE) ? " found," : (rc ? " FAILED," : ""), rc); + return rc; +} + +static int uniq_poke(const mdbx_mmap_t *pending, mdbx_mmap_t *scan, + uint64_t *abra) { + if (*abra == 0) { + const uintptr_t tid = mdbx_thread_self(); + uintptr_t uit = 0; + memcpy(&uit, &tid, (sizeof(tid) < sizeof(uit)) ? sizeof(tid) : sizeof(uit)); + *abra = + rrxmrrxmsx_0(mdbx_osal_monotime() + UINT64_C(5873865991930747) * uit); + } + const uint64_t cadabra = + rrxmrrxmsx_0(*abra + UINT64_C(7680760450171793) * (unsigned)mdbx_getpid()) + << 24 | + *abra >> 40; + MDBX_lockinfo *const scan_lck = scan->lck; + atomic_store64(&scan_lck->mti_bait_uniqueness, cadabra, + mo_SequentialConsistency); + *abra = *abra * UINT64_C(6364136223846793005) + 1; + return uniq_peek(pending, scan); +} + +__cold static int uniq_check(const mdbx_mmap_t *pending, MDBX_env **found) { + *found = nullptr; + uint64_t salt = 0; + for (MDBX_env *scan = inprocess_lcklist_head; scan != RTHC_ENVLIST_END; + scan = scan->me_lcklist_next) { + MDBX_lockinfo *const scan_lck = scan->me_lck_mmap.lck; + int err = atomic_load64(&scan_lck->mti_bait_uniqueness, mo_AcquireRelease) + ? uniq_peek(pending, &scan->me_lck_mmap) + : uniq_poke(pending, &scan->me_lck_mmap, &salt); + if (err == MDBX_ENODATA) { + uint64_t length; + if (likely(mdbx_filesize(pending->fd, &length) == MDBX_SUCCESS && + length == 0)) { + /* LY: skip checking since LCK-file is empty, i.e. just created. */ + mdbx_debug("uniq-probe: %s", "unique (new/empty lck)"); + return MDBX_RESULT_TRUE; + } + } + if (err == MDBX_RESULT_TRUE) + err = uniq_poke(pending, &scan->me_lck_mmap, &salt); + if (err == MDBX_RESULT_TRUE) { +#if MDBX_ENABLE_PGOP_STAT + safe64_inc(&scan->me_lck_mmap.lck->mti_pgop_stat.wops, 1); +#endif /* MDBX_ENABLE_PGOP_STAT */ + (void)mdbx_msync(&scan->me_lck_mmap, 0, sizeof(MDBX_lockinfo), + MDBX_SYNC_NONE); + err = uniq_poke(pending, &scan->me_lck_mmap, &salt); + } + if (err == MDBX_RESULT_TRUE) { + err = uniq_poke(pending, &scan->me_lck_mmap, &salt); + *found = scan; + mdbx_debug("uniq-probe: found %p", __Wpedantic_format_voidptr(*found)); + return MDBX_RESULT_FALSE; + } + if (unlikely(err != MDBX_SUCCESS)) { + mdbx_debug("uniq-probe: failed rc %d", err); + return err; + } + } + + mdbx_debug("uniq-probe: %s", "unique"); + return MDBX_RESULT_TRUE; +} + +static int lcklist_detach_locked(MDBX_env *env) { + MDBX_env *inprocess_neighbor = nullptr; + int rc = MDBX_SUCCESS; + if (env->me_lcklist_next != nullptr) { + mdbx_ensure(env, env->me_lcklist_next != nullptr); + mdbx_ensure(env, inprocess_lcklist_head != RTHC_ENVLIST_END); + for (MDBX_env **ptr = &inprocess_lcklist_head; *ptr != RTHC_ENVLIST_END; + ptr = &(*ptr)->me_lcklist_next) { + if (*ptr == env) { + *ptr = env->me_lcklist_next; + env->me_lcklist_next = nullptr; + break; + } + } + mdbx_ensure(env, env->me_lcklist_next == nullptr); + } + + rc = likely(mdbx_getpid() == env->me_pid) + ? 
uniq_check(&env->me_lck_mmap, &inprocess_neighbor) + : MDBX_PANIC; + if (!inprocess_neighbor && env->me_live_reader) + (void)mdbx_rpid_clear(env); + if (!MDBX_IS_ERROR(rc)) + rc = mdbx_lck_destroy(env, inprocess_neighbor); + return rc; +} + +/*------------------------------------------------------------------------------ + * LY: State of the art quicksort-based sorting, with internal stack + * and network-sort for small chunks. + * Thanks to John M. Gamble for the http://pages.ripco.net/~jgamble/nw.html */ + +#define SORT_CMP_SWAP(TYPE, CMP, a, b) \ + do { \ + const TYPE swap_tmp = (a); \ + const bool swap_cmp = CMP(swap_tmp, b); \ + (a) = swap_cmp ? swap_tmp : b; \ + (b) = swap_cmp ? b : swap_tmp; \ + } while (0) + +// 3 comparators, 3 parallel operations +// o-----^--^--o +// | | +// o--^--|--v--o +// | | +// o--v--v-----o +// +// [[1,2]] +// [[0,2]] +// [[0,1]] +#define SORT_NETWORK_3(TYPE, CMP, begin) \ + do { \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ + } while (0) + +// 5 comparators, 3 parallel operations +// o--^--^--------o +// | | +// o--v--|--^--^--o +// | | | +// o--^--v--|--v--o +// | | +// o--v-----v-----o +// +// [[0,1],[2,3]] +// [[0,2],[1,3]] +// [[1,2]] +#define SORT_NETWORK_4(TYPE, CMP, begin) \ + do { \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ + } while (0) + +// 9 comparators, 5 parallel operations +// o--^--^-----^-----------o +// | | | +// o--|--|--^--v-----^--^--o +// | | | | | +// o--|--v--|--^--^--|--v--o +// | | | | | +// o--|-----v--|--v--|--^--o +// | | | | +// o--v--------v-----v--v--o +// +// [[0,4],[1,3]] +// [[0,2]] +// [[2,4],[0,1]] +// [[2,3],[1,4]] +// [[1,2],[3,4]] +#define SORT_NETWORK_5(TYPE, CMP, begin) \ + do { \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ + } while (0) + +// 12 comparators, 6 parallel operations +// o-----^--^--^-----------------o +// | | | +// o--^--|--v--|--^--------^-----o +// | | | | | +// o--v--v-----|--|--^--^--|--^--o +// | | | | | | +// o-----^--^--v--|--|--|--v--v--o +// | | | | | +// o--^--|--v-----v--|--v--------o +// | | | +// o--v--v-----------v-----------o +// +// [[1,2],[4,5]] +// [[0,2],[3,5]] +// [[0,1],[3,4],[2,5]] +// [[0,3],[1,4]] +// [[2,4],[1,3]] +// [[2,3]] +#define SORT_NETWORK_6(TYPE, CMP, begin) \ + do { \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[5]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[3]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ + 
} while (0) + +// 16 comparators, 6 parallel operations +// o--^--------^-----^-----------------o +// | | | +// o--|--^-----|--^--v--------^--^-----o +// | | | | | | +// o--|--|--^--v--|--^-----^--|--v-----o +// | | | | | | | +// o--|--|--|-----v--|--^--v--|--^--^--o +// | | | | | | | | +// o--v--|--|--^-----v--|--^--v--|--v--o +// | | | | | | +// o-----v--|--|--------v--v-----|--^--o +// | | | | +// o--------v--v-----------------v--v--o +// +// [[0,4],[1,5],[2,6]] +// [[0,2],[1,3],[4,6]] +// [[2,4],[3,5],[0,1]] +// [[2,3],[4,5]] +// [[1,4],[3,6]] +// [[1,2],[3,4],[5,6]] +#define SORT_NETWORK_7(TYPE, CMP, begin) \ + do { \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[5]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[6]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[6]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ + } while (0) + +// 19 comparators, 6 parallel operations +// o--^--------^-----^-----------------o +// | | | +// o--|--^-----|--^--v--------^--^-----o +// | | | | | | +// o--|--|--^--v--|--^-----^--|--v-----o +// | | | | | | | +// o--|--|--|--^--v--|--^--v--|--^--^--o +// | | | | | | | | | +// o--v--|--|--|--^--v--|--^--v--|--v--o +// | | | | | | | +// o-----v--|--|--|--^--v--v-----|--^--o +// | | | | | | +// o--------v--|--v--|--^--------v--v--o +// | | | +// o-----------v-----v--v--------------o +// +// [[0,4],[1,5],[2,6],[3,7]] +// [[0,2],[1,3],[4,6],[5,7]] +// [[2,4],[3,5],[0,1],[6,7]] +// [[2,3],[4,5]] +// [[1,4],[3,6]] +// [[1,2],[3,4],[5,6]] +#define SORT_NETWORK_8(TYPE, CMP, begin) \ + do { \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[5]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[6]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[7]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[7]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[6]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ + } while (0) + +// 25 comparators, 9 parallel operations +// o--^-----^--^-----^-----------------------------------o +// | | | | +// o--v--^--v--|-----|--^-----^-----------^--------------o +// | | | | | | +// o-----v-----|-----|--|-----|--^-----^--|--^-----^--^--o +// | | | | | | | | | | +// o--^-----^--v--^--v--|-----|--|-----|--v--|-----|--v--o +// | | | | | | | | | +// o--v--^--v-----|-----v--^--v--|-----|-----|--^--v-----o +// | | | | | | | +// 
o-----v--------|--------|-----v--^--v--^--|--|--^-----o +// | | | | | | | +// o--^-----^-----v--------|--------|-----|--v--v--v-----o +// | | | | | +// o--v--^--v--------------v--------|-----v--------------o +// | | +// o-----v--------------------------v--------------------o +// +// [[0,1],[3,4],[6,7]] +// [[1,2],[4,5],[7,8]] +// [[0,1],[3,4],[6,7],[2,5]] +// [[0,3],[1,4],[5,8]] +// [[3,6],[4,7],[2,5]] +// [[0,3],[1,4],[5,7],[2,6]] +// [[1,3],[4,6]] +// [[2,4],[5,6]] +// [[2,3]] +#define SORT_NETWORK_9(TYPE, CMP, begin) \ + do { \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[8]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[5]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[3]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[8]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[6]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[7]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[5]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[3]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[7]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[6]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ + } while (0) + +// 29 comparators, 9 parallel operations +// o--------------^-----^--^--^-----------------------o +// | | | | +// o-----------^--|--^--|--|--v--^--------^-----------o +// | | | | | | | +// o--------^--|--|--|--|--v--^--v-----^--|--^--------o +// | | | | | | | | | +// o-----^--|--|--|--|--v--^--|-----^--|--v--v--^-----o +// | | | | | | | | | | +// o--^--|--|--|--|--v-----|--v--^--|--|--^-----v--^--o +// | | | | | | | | | | | +// o--|--|--|--|--v--^-----|--^--|--v--v--|-----^--v--o +// | | | | | | | | | | +// o--|--|--|--v--^--|-----v--|--v--^-----|--^--v-----o +// | | | | | | | | | +// o--|--|--v-----|--|--^-----v--^--|-----v--v--------o +// | | | | | | | +// o--|--v--------|--v--|--^-----v--v-----------------o +// | | | | +// o--v-----------v-----v--v--------------------------o +// +// [[4,9],[3,8],[2,7],[1,6],[0,5]] +// [[1,4],[6,9],[0,3],[5,8]] +// [[0,2],[3,6],[7,9]] +// [[0,1],[2,4],[5,7],[8,9]] +// [[1,2],[4,6],[7,8],[3,5]] +// [[2,5],[6,8],[1,3],[4,7]] +// [[2,3],[6,7]] +// [[3,4],[5,6]] +// [[4,5]] +#define SORT_NETWORK_10(TYPE, CMP, begin) \ + do { \ + SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[9]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[8]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[7]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[6]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[5]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[9]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[3]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[8]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[6]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[9]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ + SORT_CMP_SWAP(TYPE, 
CMP, begin[2], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[7]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[8]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[5]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[8]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[7]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]); \ + } while (0) + +// 35 comparators, 9 parallel operations +// o--^-----^-----------------^--------^--------------------o +// | | | | +// o--v--^--|--^--^--------^--|--------|--^-----------------o +// | | | | | | | | +// o--^--|--v--v--|-----^--|--|--------|--|-----^--^--------o +// | | | | | | | | | | +// o--v--v--------|-----|--|--|--^-----|--|--^--v--|--^--^--o +// | | | | | | | | | | | +// o--^-----^-----|-----|--|--v--|--^--v--v--|-----v--|--v--o +// | | | | | | | | | +// o--v--^--|--^--v--^--|--v-----|--|--------|--------v--^--o +// | | | | | | | | | +// o--^--|--v--v--^--|--v--^-----|--|--------|--------^--v--o +// | | | | | | | | | +// o--v--v--------|--|-----|-----v--|--^-----|-----^--|--^--o +// | | | | | | | | | +// o--^--^--------|--|-----|--------v--|-----v--^--|--v--v--o +// | | | | | | | | +// o--v--|--^-----|--v-----|-----------|--------v--v--------o +// | | | | | +// o-----v--v-----v--------v-----------v--------------------o +// +// [[0,1],[2,3],[4,5],[6,7],[8,9]] +// [[1,3],[5,7],[0,2],[4,6],[8,10]] +// [[1,2],[5,6],[9,10],[0,4],[3,7]] +// [[1,5],[6,10],[4,8]] +// [[5,9],[2,6],[0,4],[3,8]] +// [[1,5],[6,10],[2,3],[8,9]] +// [[1,4],[7,10],[3,5],[6,8]] +// [[2,4],[7,9],[5,6]] +// [[3,4],[7,8]] +#define SORT_NETWORK_11(TYPE, CMP, begin) \ + do { \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[7]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[10]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[10]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[7]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[5]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[10]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[8]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[9]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[6]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[8]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[5]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[10]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[10]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[6], 
begin[8]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[9]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[8]); \ + } while (0) + +// 39 comparators, parallel operations +// o--^-----^-----------------^--------^--------------------o +// | | | | +// o--v--^--|--^--^--------^--|--------|--^-----------------o +// | | | | | | | | +// o--^--|--v--v--|-----^--|--|--------|--|-----^--^--------o +// | | | | | | | | | | +// o--v--v--------|-----|--|--|--^-----|--|--^--v--|--^--^--o +// | | | | | | | | | | | +// o--^-----^-----|-----|--|--v--|--^--v--v--|-----v--|--v--o +// | | | | | | | | | +// o--v--^--|--^--v--^--|--v-----|--|--------|--------v--^--o +// | | | | | | | | | +// o--^--|--v--v--^--|--v--^-----|--|--------|--------^--v--o +// | | | | | | | | | +// o--v--v--------|--|-----|--^--v--|--^--^--|-----^--|--^--o +// | | | | | | | | | | | +// o--^-----^-----|--|-----|--|-----v--|--|--v--^--|--v--v--o +// | | | | | | | | | | +// o--v--^--|--^--|--v-----|--|--------|--|-----v--v--------o +// | | | | | | | | +// o--^--|--v--v--v--------v--|--------|--v-----------------o +// | | | | +// o--v--v--------------------v--------v--------------------o +// +// [[0,1],[2,3],[4,5],[6,7],[8,9],[10,11]] +// [[1,3],[5,7],[9,11],[0,2],[4,6],[8,10]] +// [[1,2],[5,6],[9,10],[0,4],[7,11]] +// [[1,5],[6,10],[3,7],[4,8]] +// [[5,9],[2,6],[0,4],[7,11],[3,8]] +// [[1,5],[6,10],[2,3],[8,9]] +// [[1,4],[7,10],[3,5],[6,8]] +// [[2,4],[7,9],[5,6]] +// [[3,4],[7,8]] +#define SORT_NETWORK_12(TYPE, CMP, begin) \ + do { \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[11]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[7]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[11]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[10]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[10]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[11]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[5]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[10]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[7]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[8]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[9]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[6]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[11]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[8]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[5]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[10]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[10]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[8]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[9]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ + SORT_CMP_SWAP(TYPE, CMP, 
begin[3], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[8]); \ + } while (0) + +// 45 comparators, 10 parallel operations +// o--------^--^-----^-----------------------------^-----------------o +// | | | | +// o--^-----|--v-----|-----^--------------^-----^--|-----^-----------o +// | | | | | | | | +// o--|-----|--^--^--v-----|--------------|--^--|--|--^--v--^--------o +// | | | | | | | | | | | +// o--|--^--|--|--v-----^--|--------^-----|--|--v--|--|--^--v-----^--o +// | | | | | | | | | | | | | +// o--|--v--|--|--^-----|--v-----^--v-----|--|--^--|--|--|--^--^--v--o +// | | | | | | | | | | | | | | +// o--|--^--|--|--|--^--|--------|-----^--|--|--|--v--v--v--|--v--^--o +// | | | | | | | | | | | | | | +// o--|--|--|--v--v--|--|--^-----|--^--v--|--v--|--^--------v--^--v--o +// | | | | | | | | | | | | +// o--v--|--|-----^--|--v--|--^--|--|-----v-----v--|--^--------v-----o +// | | | | | | | | | | +// o-----v--|--^--|--|-----|--v--|--|--^-----^-----v--v--^-----------o +// | | | | | | | | | | +// o--^-----|--|--|--v-----|-----v--|--v--^--|--^--------v-----------o +// | | | | | | | | | +// o--|-----|--|--|--^-----|--------v--^--|--v--v--------------------o +// | | | | | | | | +// o--v-----|--v--|--v-----|--^--------v--v--------------------------o +// | | | | +// o--------v-----v--------v--v--------------------------------------o +// +// [[1,7],[9,11],[3,4],[5,8],[0,12],[2,6]] +// [[0,1],[2,3],[4,6],[8,11],[7,12],[5,9]] +// [[0,2],[3,7],[10,11],[1,4],[6,12]] +// [[7,8],[11,12],[4,9],[6,10]] +// [[3,4],[5,6],[8,9],[10,11],[1,7]] +// [[2,6],[9,11],[1,3],[4,7],[8,10],[0,5]] +// [[2,5],[6,8],[9,10]] +// [[1,2],[3,5],[7,8],[4,6]] +// [[2,3],[4,5],[6,7],[8,9]] +// [[3,4],[5,6]] +#define SORT_NETWORK_13(TYPE, CMP, begin) \ + do { \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[7]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[11]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[8]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[12]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[6]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[11]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[12]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[9]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[7]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[11]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[12]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[8]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[11], begin[12]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[9]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[10]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[11]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[7]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[6]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[11]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[7]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[10]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[5]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[5]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[8]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[10]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], 
begin[2]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[8]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ + } while (0) + +/* *INDENT-OFF* */ +/* clang-format off */ + +// 51 comparators, 10 parallel operations +// o--^--^-----^-----------^-----------------------------------------------------------o +// | | | | +// o--v--|--^--|--^--------|--^-----^-----------------------^--------------------------o +// | | | | | | | | +// o--^--v--|--|--|--^-----|--|--^--v-----------------------|--^--^--------------------o +// | | | | | | | | | | | +// o--v-----v--|--|--|--^--|--|--|--^--------------^--------|--|--|--^--^--^-----------o +// | | | | | | | | | | | | | | | +// o--^--^-----v--|--|--|--|--|--|--|--^-----------|-----^--v--|--v--|--|--v-----------o +// | | | | | | | | | | | | | | | +// o--v--|--^-----v--|--|--|--|--|--|--|--^--^-----|-----|-----|--^--|--v-----^--------o +// | | | | | | | | | | | | | | | | | +// o--^--v--|--------v--|--|--|--|--|--|--|--|--^--|-----|-----|--v--|-----^--v-----^--o +// | | | | | | | | | | | | | | | | | +// o--v-----v-----------v--|--|--|--|--|--|--|--|--|--^--|--^--|-----|--^--|--^--^--v--o +// | | | | | | | | | | | | | | | | | | +// o--^--^-----^-----------v--|--|--|--|--|--|--|--|--|--v--|--v-----v--|--v--|--v--^--o +// | | | | | | | | | | | | | | | | +// o--v--|--^--|--^-----------v--|--|--|--|--|--v--|--|-----|--^--------|-----v--^--v--o +// | | | | | | | | | | | | | | | +// o--^--v--|--|--|--------------v--|--|--|--v-----|--|-----|--v--------|--^-----v-----o +// | | | | | | | | | | | | +// o--v-----v--|--|-----------------v--|--|--------|--v-----|--^--------|--|--^--------o +// | | | | | | | | | | +// o--^--------v--|--------------------v--|--------v--------|--|--------v--v--v--------o +// | | | | | +// o--v-----------v-----------------------v-----------------v--v-----------------------o +// +// [[0,1],[2,3],[4,5],[6,7],[8,9],[10,11],[12,13]] +// [[0,2],[4,6],[8,10],[1,3],[5,7],[9,11]] +// [[0,4],[8,12],[1,5],[9,13],[2,6],[3,7]] +// [[0,8],[1,9],[2,10],[3,11],[4,12],[5,13]] +// [[5,10],[6,9],[3,12],[7,11],[1,2],[4,8]] +// [[1,4],[7,13],[2,8],[5,6],[9,10]] +// [[2,4],[11,13],[3,8],[7,12]] +// [[6,8],[10,12],[3,5],[7,9]] +// [[3,4],[5,6],[7,8],[9,10],[11,12]] +// [[6,7],[8,9]] + +/* *INDENT-ON* */ +/* clang-format on */ + +#define SORT_NETWORK_14(TYPE, CMP, begin) \ + do { \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[11]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[12], begin[13]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[10]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[7]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[11]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[12]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[5]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[9], 
begin[13]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[6]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[7]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[8]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[9]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[10]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[11]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[12]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[13]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[10]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[9]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[12]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[11]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[8]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[13]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[8]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[10]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[11], begin[13]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[8]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[12]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[8]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[12]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[9]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[8]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[10]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[11], begin[12]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); \ + } while (0) + +/* *INDENT-OFF* */ +/* clang-format off */ + +// 56 comparators, 10 parallel operations +// o--^--^-----^-----------^--------------------------------------------------------------o +// | | | | +// o--v--|--^--|--^--------|--^-----^--------------------------^--------------------------o +// | | | | | | | | +// o--^--v--|--|--|--^-----|--|--^--v--------------------------|--^--^--------------------o +// | | | | | | | | | | | +// o--v-----v--|--|--|--^--|--|--|--^-----------------^--------|--|--|--^--^--^-----------o +// | | | | | | | | | | | | | | | +// o--^--^-----v--|--|--|--|--|--|--|--^--------------|-----^--v--|--v--|--|--v-----------o +// | | | | | | | | | | | | | | | +// o--v--|--^-----v--|--|--|--|--|--|--|--^-----^-----|-----|-----|--^--|--v-----^--------o +// | | | | | | | | | | | | | | | | | +// o--^--v--|--------v--|--|--|--|--|--|--|--^--|--^--|-----|-----|--v--|-----^--v-----^--o +// | | | | | | | | | | | | | | | | | | +// o--v-----v-----------v--|--|--|--|--|--|--|--|--|--|--^--|--^--|-----|--^--|--^--^--v--o +// | | | | | | | | | | | | | | | | | | | +// o--^--^-----^-----------v--|--|--|--|--|--|--|--|--|--|--v--|--v-----v--|--v--|--v--^--o +// | | | | | | | | | | | | | | | | | +// o--v--|--^--|--^-----------v--|--|--|--|--|--|--v--|--|-----|--^--------|-----v--^--v--o +// | | | | | | | | | | | | | | | | +// o--^--v--|--|--|--^-----------v--|--|--|--|--v-----|--|-----|--v--------|--^-----v-----o +// | | | | | | | | | | | | | | +// o--v-----v--|--|--|--------------v--|--|--|--------|--v-----|--^--^-----|--|--^--------o +// | | | | | | | | | | | | | +// o--^--^-----v--|--|-----------------v--|--|--------v--------|--|--|-----v--v--v--------o +// | | | | | | | | | +// o--v--|--------v--|--------------------v--|--^--------------v--|--v--------------------o +// | | | | 
| +// o-----v-----------v-----------------------v--v-----------------v-----------------------o +// +// [[0,1],[2,3],[4,5],[6,7],[8,9],[10,11],[12,13]] +// [[0,2],[4,6],[8,10],[12,14],[1,3],[5,7],[9,11]] +// [[0,4],[8,12],[1,5],[9,13],[2,6],[10,14],[3,7]] +// [[0,8],[1,9],[2,10],[3,11],[4,12],[5,13],[6,14]] +// [[5,10],[6,9],[3,12],[13,14],[7,11],[1,2],[4,8]] +// [[1,4],[7,13],[2,8],[11,14],[5,6],[9,10]] +// [[2,4],[11,13],[3,8],[7,12]] +// [[6,8],[10,12],[3,5],[7,9]] +// [[3,4],[5,6],[7,8],[9,10],[11,12]] +// [[6,7],[8,9]] + +/* *INDENT-ON* */ +/* clang-format on */ + +#define SORT_NETWORK_15(TYPE, CMP, begin) \ + do { \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[11]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[12], begin[13]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[10]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[12], begin[14]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[7]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[11]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[12]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[5]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[13]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[6]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[14]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[7]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[8]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[9]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[10]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[11]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[12]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[13]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[14]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[10]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[9]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[12]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[13], begin[14]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[11]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[8]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[13]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[8]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[11], begin[14]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[10]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[11], begin[13]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[8]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[12]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[8]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[12]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[9]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[8]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[10]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[11], begin[12]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); \ + } while (0) + +/* *INDENT-OFF* */ +/* clang-format off */ + +// 60 comparators, 10 
parallel operations +// o--^--^-----^-----------^-----------------------------------------------------------------o +// | | | | +// o--v--|--^--|--^--------|--^-----^-----------------------------^--------------------------o +// | | | | | | | | +// o--^--v--|--|--|--^-----|--|--^--v-----------------------------|--^--^--------------------o +// | | | | | | | | | | | +// o--v-----v--|--|--|--^--|--|--|--^--------------------^--------|--|--|--^--^--^-----------o +// | | | | | | | | | | | | | | | +// o--^--^-----v--|--|--|--|--|--|--|--^-----------------|-----^--v--|--v--|--|--v-----------o +// | | | | | | | | | | | | | | | +// o--v--|--^-----v--|--|--|--|--|--|--|--^--------^-----|-----|-----|--^--|--v-----^--------o +// | | | | | | | | | | | | | | | | | +// o--^--v--|--------v--|--|--|--|--|--|--|--^-----|--^--|-----|-----|--v--|-----^--v-----^--o +// | | | | | | | | | | | | | | | | | | +// o--v-----v-----------v--|--|--|--|--|--|--|--^--|--|--|--^--|--^--|-----|--^--|--^--^--v--o +// | | | | | | | | | | | | | | | | | | | | +// o--^--^-----^-----------v--|--|--|--|--|--|--|--|--|--|--|--v--|--v-----v--|--v--|--v--^--o +// | | | | | | | | | | | | | | | | | | +// o--v--|--^--|--^-----------v--|--|--|--|--|--|--|--v--|--|-----|--^--------|-----v--^--v--o +// | | | | | | | | | | | | | | | | | +// o--^--v--|--|--|--^-----------v--|--|--|--|--|--v-----|--|-----|--v--------|--^-----v-----o +// | | | | | | | | | | | | | | | +// o--v-----v--|--|--|--^-----------v--|--|--|--|--------|--v-----|--^--^-----|--|--^--------o +// | | | | | | | | | | | | | | | +// o--^--^-----v--|--|--|--------------v--|--|--|--------v--------|--|--|-----v--v--v--------o +// | | | | | | | | | | | +// o--v--|--^-----v--|--|-----------------v--|--|--^--------------v--|--v--------------------o +// | | | | | | | | +// o--^--v--|--------v--|--------------------v--|--v-----------------v-----------------------o +// | | | | +// o--v-----v-----------v-----------------------v--------------------------------------------o +// +// [[0,1],[2,3],[4,5],[6,7],[8,9],[10,11],[12,13],[14,15]] +// [[0,2],[4,6],[8,10],[12,14],[1,3],[5,7],[9,11],[13,15]] +// [[0,4],[8,12],[1,5],[9,13],[2,6],[10,14],[3,7],[11,15]] +// [[0,8],[1,9],[2,10],[3,11],[4,12],[5,13],[6,14],[7,15]] +// [[5,10],[6,9],[3,12],[13,14],[7,11],[1,2],[4,8]] +// [[1,4],[7,13],[2,8],[11,14],[5,6],[9,10]] +// [[2,4],[11,13],[3,8],[7,12]] +// [[6,8],[10,12],[3,5],[7,9]] +// [[3,4],[5,6],[7,8],[9,10],[11,12]] +// [[6,7],[8,9]] + +/* *INDENT-ON* */ +/* clang-format on */ + +#define SORT_NETWORK_16(TYPE, CMP, begin) \ + do { \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[11]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[12], begin[13]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[14], begin[15]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[10]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[12], begin[14]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[7]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[11]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[13], begin[15]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[12]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[5]); \ + 
SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[13]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[6]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[14]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[7]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[11], begin[15]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[8]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[9]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[10]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[11]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[12]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[13]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[14]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[15]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[10]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[9]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[12]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[13], begin[14]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[11]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[8]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[13]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[8]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[11], begin[14]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[10]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[11], begin[13]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[8]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[12]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[8]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[12]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[9]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[8]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[10]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[11], begin[12]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); \ + } while (0) + +#define SORT_INNER(TYPE, CMP, begin, end, len) \ + switch (len) { \ + default: \ + __unreachable(); \ + case 0: \ + case 1: \ + break; \ + case 2: \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ + break; \ + case 3: \ + SORT_NETWORK_3(TYPE, CMP, begin); \ + break; \ + case 4: \ + SORT_NETWORK_4(TYPE, CMP, begin); \ + break; \ + case 5: \ + SORT_NETWORK_5(TYPE, CMP, begin); \ + break; \ + case 6: \ + SORT_NETWORK_6(TYPE, CMP, begin); \ + break; \ + case 7: \ + SORT_NETWORK_7(TYPE, CMP, begin); \ + break; \ + case 8: \ + SORT_NETWORK_8(TYPE, CMP, begin); \ + break; \ + case 9: \ + SORT_NETWORK_9(TYPE, CMP, begin); \ + break; \ + case 10: \ + SORT_NETWORK_10(TYPE, CMP, begin); \ + break; \ + case 11: \ + SORT_NETWORK_11(TYPE, CMP, begin); \ + break; \ + case 12: \ + SORT_NETWORK_12(TYPE, CMP, begin); \ + break; \ + case 13: \ + SORT_NETWORK_13(TYPE, CMP, begin); \ + break; \ + case 14: \ + SORT_NETWORK_14(TYPE, CMP, begin); \ + break; \ + case 15: \ + SORT_NETWORK_15(TYPE, CMP, begin); \ + break; \ + case 16: \ + SORT_NETWORK_16(TYPE, CMP, begin); \ + break; \ + } + +#define SORT_SWAP(TYPE, a, b) \ + do { \ + const TYPE swap_tmp = (a); \ + (a) = (b); \ + (b) = swap_tmp; \ + } while (0) + +#define SORT_PUSH(low, high) \ + do { \ + top->lo = (low); \ + top->hi = (high); \ + ++top; \ + } while (0) + +#define SORT_POP(low, high) \ + do { \ + --top; \ + low = top->lo; \ + high = top->hi; \ + } while (0) + +#define SORT_IMPL(NAME, 
EXPECT_LOW_CARDINALITY_OR_PRESORTED, TYPE, CMP) \ + \ + static __inline bool NAME##_is_sorted(const TYPE *first, const TYPE *last) { \ + while (++first <= last) \ + if (CMP(first[0], first[-1])) \ + return false; \ + return true; \ + } \ + \ + typedef struct { \ + TYPE *lo, *hi; \ + } NAME##_stack; \ + \ + static __hot void NAME(TYPE *const begin, TYPE *const end) { \ + NAME##_stack stack[sizeof(unsigned) * CHAR_BIT], *top = stack; \ + \ + TYPE *hi = end - 1; \ + TYPE *lo = begin; \ + while (true) { \ + const ptrdiff_t len = hi - lo; \ + if (len < 16) { \ + SORT_INNER(TYPE, CMP, lo, hi + 1, len + 1); \ + if (unlikely(top == stack)) \ + break; \ + SORT_POP(lo, hi); \ + continue; \ + } \ + \ + TYPE *mid = lo + (len >> 1); \ + SORT_CMP_SWAP(TYPE, CMP, *lo, *mid); \ + SORT_CMP_SWAP(TYPE, CMP, *mid, *hi); \ + SORT_CMP_SWAP(TYPE, CMP, *lo, *mid); \ + \ + TYPE *right = hi - 1; \ + TYPE *left = lo + 1; \ + while (1) { \ + while (CMP(*left, *mid)) \ + ++left; \ + while (CMP(*mid, *right)) \ + --right; \ + if (unlikely(left > right)) { \ + if (EXPECT_LOW_CARDINALITY_OR_PRESORTED) { \ + if (NAME##_is_sorted(lo, right)) \ + lo = right + 1; \ + if (NAME##_is_sorted(left, hi)) \ + hi = left; \ + } \ + break; \ + } \ + SORT_SWAP(TYPE, *left, *right); \ + mid = (mid == left) ? right : (mid == right) ? left : mid; \ + ++left; \ + --right; \ + } \ + \ + if (right - lo > hi - left) { \ + SORT_PUSH(lo, right); \ + lo = left; \ + } else { \ + SORT_PUSH(left, hi); \ + hi = right; \ + } \ + } \ + \ + if (mdbx_audit_enabled()) { \ + for (TYPE *scan = begin + 1; scan < end; ++scan) \ + assert(CMP(scan[-1], scan[0])); \ + } \ + } + +/*------------------------------------------------------------------------------ + * LY: radix sort for large chunks */ + +#define RADIXSORT_IMPL(NAME, TYPE, EXTRACT_KEY, BUFFER_PREALLOCATED, END_GAP) \ + \ + __hot static bool NAME##_radixsort(TYPE *const begin, \ + const unsigned length) { \ + TYPE *tmp; \ + if (BUFFER_PREALLOCATED) { \ + tmp = begin + length + END_GAP; \ + /* memset(tmp, 0xDeadBeef, sizeof(TYPE) * length); */ \ + } else { \ + tmp = mdbx_malloc(sizeof(TYPE) * length); \ + if (unlikely(!tmp)) \ + return false; \ + } \ + \ + unsigned key_shift = 0, key_diff_mask; \ + do { \ + struct { \ + unsigned a[256], b[256]; \ + } counters; \ + memset(&counters, 0, sizeof(counters)); \ + \ + key_diff_mask = 0; \ + unsigned prev_key = EXTRACT_KEY(begin) >> key_shift; \ + TYPE *r = begin, *end = begin + length; \ + do { \ + const unsigned key = EXTRACT_KEY(r) >> key_shift; \ + counters.a[key & 255]++; \ + counters.b[(key >> 8) & 255]++; \ + key_diff_mask |= prev_key ^ key; \ + prev_key = key; \ + } while (++r != end); \ + \ + unsigned ta = 0, tb = 0; \ + for (unsigned i = 0; i < 256; ++i) { \ + const unsigned ia = counters.a[i]; \ + counters.a[i] = ta; \ + ta += ia; \ + const unsigned ib = counters.b[i]; \ + counters.b[i] = tb; \ + tb += ib; \ + } \ + \ + r = begin; \ + do { \ + const unsigned key = EXTRACT_KEY(r) >> key_shift; \ + tmp[counters.a[key & 255]++] = *r; \ + } while (++r != end); \ + \ + if (unlikely(key_diff_mask < 256)) { \ + memcpy(begin, tmp, (char *)end - (char *)begin); \ + break; \ + } \ + end = (r = tmp) + length; \ + do { \ + const unsigned key = EXTRACT_KEY(r) >> key_shift; \ + begin[counters.b[(key >> 8) & 255]++] = *r; \ + } while (++r != end); \ + \ + key_shift += 16; \ + } while (key_diff_mask >> 16); \ + \ + if (!(BUFFER_PREALLOCATED)) \ + mdbx_free(tmp); \ + return true; \ + } + 
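+/* An illustrative sketch of how the SORT_IMPL and RADIXSORT_IMPL generators
+ * above are meant to be instantiated. The names `sample`, `sample_sort` and
+ * the SAMPLE_* macros are hypothetical and exist only for this example (the
+ * real instantiations elsewhere in this file are for page-number and
+ * dirty-page lists); it is kept under `#if 0` so it is never compiled. */
+#if 0 /* usage example */
+#define SAMPLE_LESS(a, b) ((a) < (b))
+/* expands to `static __hot void sample_sort(uint32_t *begin, uint32_t *end)`:
+ * a quicksort with an explicit stack, which falls back to the sorting
+ * networks above for chunks shorter than 16 items. */
+SORT_IMPL(sample_sort, false, uint32_t, SAMPLE_LESS)
+
+#define SAMPLE_KEY(ptr) (*(ptr))
+/* expands to `static bool sample_radixsort(uint32_t *begin, unsigned length)`:
+ * an LSD radix sort which consumes 16 key bits per pass and stops early
+ * once the remaining high bits are identical across all items. */
+RADIXSORT_IMPL(sample, uint32_t, SAMPLE_KEY, /* BUFFER_PREALLOCATED */ false,
+               /* END_GAP */ 0)
+
+static void sample(uint32_t *array, unsigned length) {
+  /* the radix sort pays off only for large chunks and may fail to allocate
+   * its scratch buffer, in which case the quicksort still applies */
+  if (length < 42 || !sample_radixsort(array, length))
+    sample_sort(array, array + length);
+}
+#endif /* usage example */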
+/*------------------------------------------------------------------------------ + * LY: Binary search */ + +#define SEARCH_IMPL(NAME, TYPE_LIST, TYPE_ARG, CMP) \ + static __always_inline const TYPE_LIST *NAME( \ + const TYPE_LIST *first, unsigned length, const TYPE_ARG item) { \ + const TYPE_LIST *const begin = first, *const end = begin + length; \ + \ + while (length > 3) { \ + const unsigned whole = length; \ + length >>= 1; \ + const TYPE_LIST *const middle = first + length; \ + const unsigned left = whole - length - 1; \ + const bool cmp = CMP(*middle, item); \ + length = cmp ? left : length; \ + first = cmp ? middle + 1 : first; \ + } \ + \ + switch (length) { \ + case 3: \ + if (!CMP(*first, item)) \ + break; \ + ++first; \ + __fallthrough /* fall through */; \ + case 2: \ + if (!CMP(*first, item)) \ + break; \ + ++first; \ + __fallthrough /* fall through */; \ + case 1: \ + if (!CMP(*first, item)) \ + break; \ + ++first; \ + __fallthrough /* fall through */; \ + case 0: \ + break; \ + default: \ + __unreachable(); \ + } \ + \ + if (mdbx_audit_enabled()) { \ + for (const TYPE_LIST *scan = begin; scan < first; ++scan) \ + assert(CMP(*scan, item)); \ + for (const TYPE_LIST *scan = first; scan < end; ++scan) \ + assert(!CMP(*scan, item)); \ + (void)begin, (void)end; \ + } \ + \ + return first; \ + } + +/*----------------------------------------------------------------------------*/ + +static __always_inline size_t pnl2bytes(size_t size) { + assert(size > 0 && size <= MDBX_PGL_LIMIT); +#if MDBX_PNL_PREALLOC_FOR_RADIXSORT + size += size; +#endif /* MDBX_PNL_PREALLOC_FOR_RADIXSORT */ + STATIC_ASSERT(MDBX_ASSUME_MALLOC_OVERHEAD + + (MDBX_PGL_LIMIT * (MDBX_PNL_PREALLOC_FOR_RADIXSORT + 1) + + MDBX_PNL_GRANULATE + 2) * + sizeof(pgno_t) < + SIZE_MAX / 4 * 3); + size_t bytes = + ceil_powerof2(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(pgno_t) * (size + 2), + MDBX_PNL_GRANULATE * sizeof(pgno_t)) - + MDBX_ASSUME_MALLOC_OVERHEAD; + return bytes; +} + +static __always_inline pgno_t bytes2pnl(const size_t bytes) { + size_t size = bytes / sizeof(pgno_t); + assert(size > 2 && size <= MDBX_PGL_LIMIT); + size -= 2; +#if MDBX_PNL_PREALLOC_FOR_RADIXSORT + size >>= 1; +#endif /* MDBX_PNL_PREALLOC_FOR_RADIXSORT */ + return (pgno_t)size; +} + +static MDBX_PNL mdbx_pnl_alloc(size_t size) { + size_t bytes = pnl2bytes(size); + MDBX_PNL pl = mdbx_malloc(bytes); + if (likely(pl)) { +#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) + bytes = malloc_usable_size(pl); +#endif /* malloc_usable_size */ + pl[0] = bytes2pnl(bytes); + assert(pl[0] >= size); + pl[1] = 0; + pl += 1; + } + return pl; +} + +static void mdbx_pnl_free(MDBX_PNL pl) { + if (likely(pl)) + mdbx_free(pl - 1); +} + +/* Shrink the PNL to the default size if it has grown larger */ +static void mdbx_pnl_shrink(MDBX_PNL *ppl) { + assert(bytes2pnl(pnl2bytes(MDBX_PNL_INITIAL)) >= MDBX_PNL_INITIAL && + bytes2pnl(pnl2bytes(MDBX_PNL_INITIAL)) < MDBX_PNL_INITIAL * 3 / 2); + assert(MDBX_PNL_SIZE(*ppl) <= MDBX_PGL_LIMIT && + MDBX_PNL_ALLOCLEN(*ppl) >= MDBX_PNL_SIZE(*ppl)); + MDBX_PNL_SIZE(*ppl) = 0; + if (unlikely(MDBX_PNL_ALLOCLEN(*ppl) > + MDBX_PNL_INITIAL * 2 - MDBX_CACHELINE_SIZE / sizeof(pgno_t))) { + size_t bytes = pnl2bytes(MDBX_PNL_INITIAL); + MDBX_PNL pl = mdbx_realloc(*ppl - 1, bytes); + if (likely(pl)) { +#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) + bytes = malloc_usable_size(pl); +#endif /* malloc_usable_size */ + *pl = bytes2pnl(bytes); + *ppl = pl + 1; + } + } +} + +/* Grow the PNL to 
at least the given size */
+static int mdbx_pnl_reserve(MDBX_PNL *ppl, const size_t wanna) {
+  const size_t allocated = MDBX_PNL_ALLOCLEN(*ppl);
+  assert(MDBX_PNL_SIZE(*ppl) <= MDBX_PGL_LIMIT &&
+         MDBX_PNL_ALLOCLEN(*ppl) >= MDBX_PNL_SIZE(*ppl));
+  if (likely(allocated >= wanna))
+    return MDBX_SUCCESS;
+
+  if (unlikely(wanna > /* paranoia */ MDBX_PGL_LIMIT)) {
+    mdbx_error("PNL too long (%zu > %zu)", wanna, (size_t)MDBX_PGL_LIMIT);
+    return MDBX_TXN_FULL;
+  }
+
+  const size_t size = (wanna + wanna - allocated < MDBX_PGL_LIMIT)
+                          ? wanna + wanna - allocated
+                          : MDBX_PGL_LIMIT;
+  size_t bytes = pnl2bytes(size);
+  MDBX_PNL pl = mdbx_realloc(*ppl - 1, bytes);
+  if (likely(pl)) {
+#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size)
+    bytes = malloc_usable_size(pl);
+#endif /* malloc_usable_size */
+    *pl = bytes2pnl(bytes);
+    assert(*pl >= wanna);
+    *ppl = pl + 1;
+    return MDBX_SUCCESS;
+  }
+  return MDBX_ENOMEM;
+}
+
+/* Make room for num additional elements in a PNL */
+static __always_inline int __must_check_result mdbx_pnl_need(MDBX_PNL *ppl,
+                                                             size_t num) {
+  assert(MDBX_PNL_SIZE(*ppl) <= MDBX_PGL_LIMIT &&
+         MDBX_PNL_ALLOCLEN(*ppl) >= MDBX_PNL_SIZE(*ppl));
+  assert(num <= MDBX_PGL_LIMIT);
+  const size_t wanna = MDBX_PNL_SIZE(*ppl) + num;
+  return likely(MDBX_PNL_ALLOCLEN(*ppl) >= wanna)
+             ? MDBX_SUCCESS
+             : mdbx_pnl_reserve(ppl, wanna);
+}
+
+static __always_inline void mdbx_pnl_xappend(MDBX_PNL pl, pgno_t pgno) {
+  assert(MDBX_PNL_SIZE(pl) < MDBX_PNL_ALLOCLEN(pl));
+  if (mdbx_audit_enabled()) {
+    for (unsigned i = MDBX_PNL_SIZE(pl); i > 0; --i)
+      assert(pgno != pl[i]);
+  }
+  MDBX_PNL_SIZE(pl) += 1;
+  MDBX_PNL_LAST(pl) = pgno;
+}
+
+/* Append a pgno range onto an unsorted PNL */
+__always_inline static int __must_check_result
+mdbx_pnl_append_range(bool spilled, MDBX_PNL *ppl, pgno_t pgno, unsigned n) {
+  assert(n > 0);
+  int rc = mdbx_pnl_need(ppl, n);
+  if (unlikely(rc != MDBX_SUCCESS))
+    return rc;
+
+  const MDBX_PNL pnl = *ppl;
+#if MDBX_PNL_ASCENDING
+  unsigned w = MDBX_PNL_SIZE(pnl);
+  do {
+    pnl[++w] = pgno;
+    pgno += spilled ? 2 : 1;
+  } while (--n);
+  MDBX_PNL_SIZE(pnl) = w;
+#else
+  unsigned w = MDBX_PNL_SIZE(pnl) + n;
+  MDBX_PNL_SIZE(pnl) = w;
+  do {
+    pnl[w--] = pgno;
+    pgno += spilled ? 2 : 1;
+  } while (--n);
+#endif
+
+  return MDBX_SUCCESS;
+}
+
+/* Insert a pgno range into the sorted PNL */
+static __hot int __must_check_result mdbx_pnl_insert_range(MDBX_PNL *ppl,
+                                                           pgno_t pgno,
+                                                           unsigned n) {
+  assert(n > 0);
+  int rc = mdbx_pnl_need(ppl, n);
+  if (unlikely(rc != MDBX_SUCCESS))
+    return rc;
+
+  const MDBX_PNL pnl = *ppl;
+  unsigned r = MDBX_PNL_SIZE(pnl), w = r + n;
+  MDBX_PNL_SIZE(pnl) = w;
+  while (r && MDBX_PNL_DISORDERED(pnl[r], pgno))
+    pnl[w--] = pnl[r--];
+
+  for (pgno_t fill = MDBX_PNL_ASCENDING ? pgno + n : pgno; w > r; --w)
+    pnl[w] = MDBX_PNL_ASCENDING ?
--fill : fill++; + + return MDBX_SUCCESS; +} + +static bool mdbx_pnl_check(const MDBX_PNL pl, const pgno_t limit) { + if (likely(MDBX_PNL_SIZE(pl))) { + assert(MDBX_PNL_LEAST(pl) >= MIN_PAGENO); + assert(MDBX_PNL_MOST(pl) < limit); + assert(MDBX_PNL_SIZE(pl) <= MDBX_PGL_LIMIT); + if (unlikely(MDBX_PNL_SIZE(pl) > MDBX_PGL_LIMIT)) + return false; + if (unlikely(MDBX_PNL_LEAST(pl) < MIN_PAGENO)) + return false; + if (unlikely(MDBX_PNL_MOST(pl) >= limit)) + return false; + if (mdbx_audit_enabled()) { + for (const pgno_t *scan = &MDBX_PNL_LAST(pl); --scan > pl;) { + assert(MDBX_PNL_ORDERED(scan[0], scan[1])); + if (unlikely(!MDBX_PNL_ORDERED(scan[0], scan[1]))) + return false; + } + } + } + return true; +} + +static __always_inline bool mdbx_pnl_check4assert(const MDBX_PNL pl, + const pgno_t limit) { + if (unlikely(pl == nullptr)) + return true; + assert(MDBX_PNL_ALLOCLEN(pl) >= MDBX_PNL_SIZE(pl)); + if (unlikely(MDBX_PNL_ALLOCLEN(pl) < MDBX_PNL_SIZE(pl))) + return false; + return mdbx_pnl_check(pl, limit); +} + +/* Merge an PNL onto an PNL. The destination PNL must be big enough */ +static void __hot mdbx_pnl_xmerge(MDBX_PNL dst, const MDBX_PNL src) { + assert(mdbx_pnl_check4assert(dst, MAX_PAGENO + 1)); + assert(mdbx_pnl_check(src, MAX_PAGENO + 1)); + const size_t total = MDBX_PNL_SIZE(dst) + MDBX_PNL_SIZE(src); + assert(MDBX_PNL_ALLOCLEN(dst) >= total); + pgno_t *w = dst + total; + pgno_t *d = dst + MDBX_PNL_SIZE(dst); + const pgno_t *s = src + MDBX_PNL_SIZE(src); + dst[0] = /* detent for scan below */ (MDBX_PNL_ASCENDING ? 0 : ~(pgno_t)0); + while (s > src) { + while (MDBX_PNL_ORDERED(*s, *d)) + *w-- = *d--; + *w-- = *s--; + } + MDBX_PNL_SIZE(dst) = (pgno_t)total; + assert(mdbx_pnl_check4assert(dst, MAX_PAGENO + 1)); +} + +static void mdbx_spill_remove(MDBX_txn *txn, unsigned idx, unsigned npages) { + mdbx_tassert(txn, idx > 0 && idx <= MDBX_PNL_SIZE(txn->tw.spill_pages) && + txn->tw.spill_least_removed > 0); + txn->tw.spill_least_removed = + (idx < txn->tw.spill_least_removed) ? idx : txn->tw.spill_least_removed; + txn->tw.spill_pages[idx] |= 1; + MDBX_PNL_SIZE(txn->tw.spill_pages) -= + (idx == MDBX_PNL_SIZE(txn->tw.spill_pages)); + + while (unlikely(npages > 1)) { + const pgno_t pgno = (txn->tw.spill_pages[idx] >> 1) + 1; + if (MDBX_PNL_ASCENDING) { + if (++idx > MDBX_PNL_SIZE(txn->tw.spill_pages) || + (txn->tw.spill_pages[idx] >> 1) != pgno) + return; + } else { + if (--idx < 1 || (txn->tw.spill_pages[idx] >> 1) != pgno) + return; + txn->tw.spill_least_removed = (idx < txn->tw.spill_least_removed) + ? 
idx
+                                       : txn->tw.spill_least_removed;
+    }
+    txn->tw.spill_pages[idx] |= 1;
+    MDBX_PNL_SIZE(txn->tw.spill_pages) -=
+        (idx == MDBX_PNL_SIZE(txn->tw.spill_pages));
+    --npages;
+  }
+}
+
+static MDBX_PNL mdbx_spill_purge(MDBX_txn *txn) {
+  mdbx_tassert(txn, txn->tw.spill_least_removed > 0);
+  const MDBX_PNL sl = txn->tw.spill_pages;
+  if (txn->tw.spill_least_removed != INT_MAX) {
+    unsigned len = MDBX_PNL_SIZE(sl), r, w;
+    for (w = r = txn->tw.spill_least_removed; r <= len; ++r) {
+      sl[w] = sl[r];
+      w += 1 - (sl[r] & 1);
+    }
+    for (size_t i = 1; i < w; ++i)
+      mdbx_tassert(txn, (sl[i] & 1) == 0);
+    MDBX_PNL_SIZE(sl) = w - 1;
+    txn->tw.spill_least_removed = INT_MAX;
+  } else {
+    for (size_t i = 1; i <= MDBX_PNL_SIZE(sl); ++i)
+      mdbx_tassert(txn, (sl[i] & 1) == 0);
+  }
+  return sl;
+}
+
+#if MDBX_PNL_ASCENDING
+#define MDBX_PNL_EXTRACT_KEY(ptr) (*(ptr))
+#else
+#define MDBX_PNL_EXTRACT_KEY(ptr) (P_INVALID - *(ptr))
+#endif
+RADIXSORT_IMPL(pgno, pgno_t, MDBX_PNL_EXTRACT_KEY,
+               MDBX_PNL_PREALLOC_FOR_RADIXSORT, 0)
+
+SORT_IMPL(pgno_sort, false, pgno_t, MDBX_PNL_ORDERED)
+static __hot void mdbx_pnl_sort(MDBX_PNL pnl) {
+  if (likely(MDBX_PNL_SIZE(pnl) < MDBX_RADIXSORT_THRESHOLD) ||
+      unlikely(!pgno_radixsort(&MDBX_PNL_FIRST(pnl), MDBX_PNL_SIZE(pnl))))
+    pgno_sort(MDBX_PNL_BEGIN(pnl), MDBX_PNL_END(pnl));
+  assert(mdbx_pnl_check(pnl, MAX_PAGENO + 1));
+}
+
+/* Search for a pgno in a PNL.
+ * Returns the index of the first item greater than or equal to pgno. */
+SEARCH_IMPL(pgno_bsearch, pgno_t, pgno_t, MDBX_PNL_ORDERED)
+
+static __hot unsigned mdbx_pnl_search(const MDBX_PNL pnl, pgno_t pgno) {
+  assert(mdbx_pnl_check4assert(pnl, MAX_PAGENO + 1));
+  const pgno_t *begin = MDBX_PNL_BEGIN(pnl);
+  const pgno_t *it = pgno_bsearch(begin, MDBX_PNL_SIZE(pnl), pgno);
+  const pgno_t *end = begin + MDBX_PNL_SIZE(pnl);
+  assert(it >= begin && it <= end);
+  if (it != begin)
+    assert(MDBX_PNL_ORDERED(it[-1], pgno));
+  if (it != end)
+    assert(!MDBX_PNL_ORDERED(it[0], pgno));
+  return (unsigned)(it - begin + 1);
+}
+
+static __inline unsigned mdbx_pnl_exist(const MDBX_PNL pnl, pgno_t pgno) {
+  unsigned n = mdbx_pnl_search(pnl, pgno);
+  return (n <= MDBX_PNL_SIZE(pnl) && pnl[n] == pgno) ?
n : 0; +} + +static __inline unsigned mdbx_pnl_intersect(const MDBX_PNL pnl, pgno_t pgno, + unsigned npages) { + const unsigned len = MDBX_PNL_SIZE(pnl); + if (mdbx_log_enabled(MDBX_LOG_EXTRA)) { + mdbx_debug_extra("PNL len %u [", len); + for (unsigned i = 1; i <= len; ++i) + mdbx_debug_extra_print(" %" PRIaPGNO, pnl[i]); + mdbx_debug_extra_print("%s\n", "]"); + } + const pgno_t range_last = pgno + npages - 1; +#if MDBX_PNL_ASCENDING + const unsigned n = mdbx_pnl_search(pnl, pgno); + assert(n && (n == MDBX_PNL_SIZE(pnl) + 1 || pgno <= pnl[n])); + const bool rc = n <= MDBX_PNL_SIZE(pnl) && pnl[n] <= range_last; +#else + const unsigned n = mdbx_pnl_search(pnl, range_last); + assert(n && (n == MDBX_PNL_SIZE(pnl) + 1 || range_last >= pnl[n])); + const bool rc = n <= MDBX_PNL_SIZE(pnl) && pnl[n] >= pgno; +#endif + if (mdbx_assert_enabled()) { + bool check = false; + for (unsigned i = 0; i < npages; ++i) + check |= mdbx_pnl_exist(pnl, pgno + i) != 0; + assert(check == rc); + } + return rc; +} + +/*----------------------------------------------------------------------------*/ + +static __always_inline size_t txl2bytes(const size_t size) { + assert(size > 0 && size <= MDBX_TXL_MAX * 2); + size_t bytes = + ceil_powerof2(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(txnid_t) * (size + 2), + MDBX_TXL_GRANULATE * sizeof(txnid_t)) - + MDBX_ASSUME_MALLOC_OVERHEAD; + return bytes; +} + +static __always_inline size_t bytes2txl(const size_t bytes) { + size_t size = bytes / sizeof(txnid_t); + assert(size > 2 && size <= MDBX_TXL_MAX * 2); + return size - 2; +} + +static MDBX_TXL mdbx_txl_alloc(void) { + size_t bytes = txl2bytes(MDBX_TXL_INITIAL); + MDBX_TXL tl = mdbx_malloc(bytes); + if (likely(tl)) { +#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) + bytes = malloc_usable_size(tl); +#endif /* malloc_usable_size */ + tl[0] = bytes2txl(bytes); + assert(tl[0] >= MDBX_TXL_INITIAL); + tl[1] = 0; + tl += 1; + } + return tl; +} + +static void mdbx_txl_free(MDBX_TXL tl) { + if (likely(tl)) + mdbx_free(tl - 1); +} + +static int mdbx_txl_reserve(MDBX_TXL *ptl, const size_t wanna) { + const size_t allocated = (size_t)MDBX_PNL_ALLOCLEN(*ptl); + assert(MDBX_PNL_SIZE(*ptl) <= MDBX_TXL_MAX && + MDBX_PNL_ALLOCLEN(*ptl) >= MDBX_PNL_SIZE(*ptl)); + if (likely(allocated >= wanna)) + return MDBX_SUCCESS; + + if (unlikely(wanna > /* paranoia */ MDBX_TXL_MAX)) { + mdbx_error("TXL too long (%zu > %zu)", wanna, (size_t)MDBX_TXL_MAX); + return MDBX_TXN_FULL; + } + + const size_t size = (wanna + wanna - allocated < MDBX_TXL_MAX) + ? wanna + wanna - allocated + : MDBX_TXL_MAX; + size_t bytes = txl2bytes(size); + MDBX_TXL tl = mdbx_realloc(*ptl - 1, bytes); + if (likely(tl)) { +#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) + bytes = malloc_usable_size(tl); +#endif /* malloc_usable_size */ + *tl = bytes2txl(bytes); + assert(*tl >= wanna); + *ptl = tl + 1; + return MDBX_SUCCESS; + } + return MDBX_ENOMEM; +} + +static __always_inline int __must_check_result mdbx_txl_need(MDBX_TXL *ptl, + size_t num) { + assert(MDBX_PNL_SIZE(*ptl) <= MDBX_TXL_MAX && + MDBX_PNL_ALLOCLEN(*ptl) >= MDBX_PNL_SIZE(*ptl)); + assert(num <= MDBX_PGL_LIMIT); + const size_t wanna = (size_t)MDBX_PNL_SIZE(*ptl) + num; + return likely(MDBX_PNL_ALLOCLEN(*ptl) >= wanna) + ? 
MDBX_SUCCESS + : mdbx_txl_reserve(ptl, wanna); +} + +static __always_inline void mdbx_txl_xappend(MDBX_TXL tl, txnid_t id) { + assert(MDBX_PNL_SIZE(tl) < MDBX_PNL_ALLOCLEN(tl)); + MDBX_PNL_SIZE(tl) += 1; + MDBX_PNL_LAST(tl) = id; +} + +#define TXNID_SORT_CMP(first, last) ((first) > (last)) +SORT_IMPL(txnid_sort, false, txnid_t, TXNID_SORT_CMP) +static void mdbx_txl_sort(MDBX_TXL tl) { + txnid_sort(MDBX_PNL_BEGIN(tl), MDBX_PNL_END(tl)); +} + +static int __must_check_result mdbx_txl_append(MDBX_TXL *ptl, txnid_t id) { + if (unlikely(MDBX_PNL_SIZE(*ptl) == MDBX_PNL_ALLOCLEN(*ptl))) { + int rc = mdbx_txl_need(ptl, MDBX_TXL_GRANULATE); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + } + mdbx_txl_xappend(*ptl, id); + return MDBX_SUCCESS; +} + +/*----------------------------------------------------------------------------*/ + +#define MDBX_DPL_UNSORTED_BACKLOG 16 +#define MDBX_DPL_GAP_FOR_MERGESORT MDBX_DPL_UNSORTED_BACKLOG +#define MDBX_DPL_GAP_FOR_EDGING 2 +#define MDBX_DPL_RESERVE_GAP \ + (MDBX_DPL_GAP_FOR_MERGESORT + MDBX_DPL_GAP_FOR_EDGING) + +static __always_inline size_t dpl2bytes(ptrdiff_t size) { + assert(size > CURSOR_STACK && (size_t)size <= MDBX_PGL_LIMIT); +#if MDBX_DPL_PREALLOC_FOR_RADIXSORT + size += size; +#endif /* MDBX_DPL_PREALLOC_FOR_RADIXSORT */ + STATIC_ASSERT(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(MDBX_dpl) + + (MDBX_PGL_LIMIT * (MDBX_DPL_PREALLOC_FOR_RADIXSORT + 1) + + MDBX_DPL_RESERVE_GAP) * + sizeof(MDBX_dp) + + MDBX_PNL_GRANULATE * sizeof(void *) * 2 < + SIZE_MAX / 4 * 3); + size_t bytes = + ceil_powerof2(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(MDBX_dpl) + + ((size_t)size + MDBX_DPL_RESERVE_GAP) * sizeof(MDBX_dp), + MDBX_PNL_GRANULATE * sizeof(void *) * 2) - + MDBX_ASSUME_MALLOC_OVERHEAD; + return bytes; +} + +static __always_inline unsigned bytes2dpl(const ptrdiff_t bytes) { + size_t size = (bytes - sizeof(MDBX_dpl)) / sizeof(MDBX_dp); + assert(size > CURSOR_STACK + MDBX_DPL_RESERVE_GAP && + size <= MDBX_PGL_LIMIT + MDBX_PNL_GRANULATE); + size -= MDBX_DPL_RESERVE_GAP; +#if MDBX_DPL_PREALLOC_FOR_RADIXSORT + size >>= 1; +#endif /* MDBX_DPL_PREALLOC_FOR_RADIXSORT */ + return (unsigned)size; +} + +static __always_inline unsigned dpl_setlen(MDBX_dpl *dl, unsigned len) { + static const MDBX_page dpl_stub_pageE = { + {0}, 0, P_BAD, {0}, /* pgno */ ~(pgno_t)0}; + assert(dpl_stub_pageE.mp_flags == P_BAD && + dpl_stub_pageE.mp_pgno == P_INVALID); + dl->length = len; + dl->items[len + 1].ptr = (MDBX_page *)&dpl_stub_pageE; + dl->items[len + 1].pgno = P_INVALID; + dl->items[len + 1].extra = 0; + return len; +} + +static __always_inline void dpl_clear(MDBX_dpl *dl) { + static const MDBX_page dpl_stub_pageB = {{0}, 0, P_BAD, {0}, /* pgno */ 0}; + assert(dpl_stub_pageB.mp_flags == P_BAD && dpl_stub_pageB.mp_pgno == 0); + dl->sorted = dpl_setlen(dl, 0); + dl->items[0].ptr = (MDBX_page *)&dpl_stub_pageB; + dl->items[0].pgno = 0; + dl->items[0].extra = 0; + assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); +} + +static void mdbx_dpl_free(MDBX_txn *txn) { + if (likely(txn->tw.dirtylist)) { + mdbx_free(txn->tw.dirtylist); + txn->tw.dirtylist = NULL; + } +} + +static MDBX_dpl *mdbx_dpl_reserve(MDBX_txn *txn, size_t size) { + size_t bytes = dpl2bytes((size < MDBX_PGL_LIMIT) ? 
size : MDBX_PGL_LIMIT); + MDBX_dpl *const dl = mdbx_realloc(txn->tw.dirtylist, bytes); + if (likely(dl)) { +#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) + bytes = malloc_usable_size(dl); +#endif /* malloc_usable_size */ + dl->detent = bytes2dpl(bytes); + mdbx_tassert(txn, txn->tw.dirtylist == NULL || dl->length <= dl->detent); + txn->tw.dirtylist = dl; + } + return dl; +} + +static int mdbx_dpl_alloc(MDBX_txn *txn) { + mdbx_tassert(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); + const int wanna = (txn->mt_env->me_options.dp_initial < txn->mt_geo.upper) + ? txn->mt_env->me_options.dp_initial + : txn->mt_geo.upper; + if (txn->tw.dirtylist) { + dpl_clear(txn->tw.dirtylist); + const int realloc_threshold = 64; + if (likely( + !((int)(txn->tw.dirtylist->detent - wanna) > realloc_threshold || + (int)(txn->tw.dirtylist->detent - wanna) < -realloc_threshold))) + return MDBX_SUCCESS; + } + if (unlikely(!mdbx_dpl_reserve(txn, wanna))) + return MDBX_ENOMEM; + dpl_clear(txn->tw.dirtylist); + return MDBX_SUCCESS; +} + +#define MDBX_DPL_EXTRACT_KEY(ptr) ((ptr)->pgno) +RADIXSORT_IMPL(dpl, MDBX_dp, MDBX_DPL_EXTRACT_KEY, + MDBX_DPL_PREALLOC_FOR_RADIXSORT, 1) + +#define DP_SORT_CMP(first, last) ((first).pgno < (last).pgno) +SORT_IMPL(dp_sort, false, MDBX_dp, DP_SORT_CMP) + +__hot __noinline static MDBX_dpl *mdbx_dpl_sort_slowpath(const MDBX_txn *txn) { + MDBX_dpl *dl = txn->tw.dirtylist; + assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); + const unsigned unsorted = dl->length - dl->sorted; + if (likely(unsorted < MDBX_RADIXSORT_THRESHOLD) || + unlikely(!dpl_radixsort(dl->items + 1, dl->length))) { + if (dl->sorted > unsorted / 4 + 4 && + (MDBX_DPL_PREALLOC_FOR_RADIXSORT || + dl->length + unsorted < dl->detent + MDBX_DPL_GAP_FOR_MERGESORT)) { + MDBX_dp *const sorted_begin = dl->items + 1; + MDBX_dp *const sorted_end = sorted_begin + dl->sorted; + MDBX_dp *const end = + dl->items + (MDBX_DPL_PREALLOC_FOR_RADIXSORT + ? dl->length + dl->length + 1 + : dl->detent + MDBX_DPL_RESERVE_GAP); + MDBX_dp *const tmp = end - unsorted; + assert(dl->items + dl->length + 1 < tmp); + /* copy unsorted to the end of allocated space and sort it */ + memcpy(tmp, sorted_end, unsorted * sizeof(MDBX_dp)); + dp_sort(tmp, tmp + unsorted); + /* merge two parts from end to begin */ + MDBX_dp *w = dl->items + dl->length; + MDBX_dp *l = dl->items + dl->sorted; + MDBX_dp *r = end - 1; + do { + const bool cmp = l->pgno > r->pgno; + *w = cmp ? *l : *r; + l -= cmp; + r += cmp - 1; + } while (likely(--w > l)); + assert(r == tmp - 1); + assert(dl->items[0].pgno == 0 && + dl->items[dl->length + 1].pgno == P_INVALID); + if (mdbx_assert_enabled()) + for (unsigned i = 0; i <= dl->length; ++i) + assert(dl->items[i].pgno < dl->items[i + 1].pgno); + } else { + dp_sort(dl->items + 1, dl->items + dl->length + 1); + assert(dl->items[0].pgno == 0 && + dl->items[dl->length + 1].pgno == P_INVALID); + } + } else { + assert(dl->items[0].pgno == 0 && + dl->items[dl->length + 1].pgno == P_INVALID); + } + dl->sorted = dl->length; + return dl; +} + +static __always_inline MDBX_dpl *mdbx_dpl_sort(const MDBX_txn *txn) { + MDBX_dpl *dl = txn->tw.dirtylist; + assert(dl->length <= MDBX_PGL_LIMIT); + assert(dl->sorted <= dl->length); + assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); + return likely(dl->sorted == dl->length) ? dl : mdbx_dpl_sort_slowpath(txn); +} + +/* Returns the index of the first dirty-page whose pgno + * member is greater than or equal to id. 
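+ * For example, given sorted items with pgnos {2, 5, 9}: id == 4 and
+ * id == 5 both resolve to the slot holding pgno 5, while id == 10
+ * resolves to one past the last item (the usual lower-bound convention,
+ * matching pgno_bsearch above).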
*/ +#define DP_SEARCH_CMP(dp, id) ((dp).pgno < (id)) +SEARCH_IMPL(dp_bsearch, MDBX_dp, pgno_t, DP_SEARCH_CMP) + +static unsigned __hot mdbx_dpl_search(const MDBX_txn *txn, pgno_t pgno) { + MDBX_dpl *dl = txn->tw.dirtylist; + assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); + if (mdbx_audit_enabled()) { + for (const MDBX_dp *ptr = dl->items + dl->sorted; --ptr > dl->items;) { + assert(ptr[0].pgno < ptr[1].pgno); + assert(ptr[0].pgno >= NUM_METAS); + } + } + + switch (dl->length - dl->sorted) { + default: + /* sort a whole */ + mdbx_dpl_sort_slowpath(txn); + break; + case 0: + /* whole sorted cases */ + break; + +#define LINEAR_SEARCH_CASE(N) \ + case N: \ + if (dl->items[dl->length - N + 1].pgno == pgno) \ + return dl->length - N + 1; \ + __fallthrough + + /* try linear search until the threshold */ + LINEAR_SEARCH_CASE(16); /* fall through */ + LINEAR_SEARCH_CASE(15); /* fall through */ + LINEAR_SEARCH_CASE(14); /* fall through */ + LINEAR_SEARCH_CASE(13); /* fall through */ + LINEAR_SEARCH_CASE(12); /* fall through */ + LINEAR_SEARCH_CASE(11); /* fall through */ + LINEAR_SEARCH_CASE(10); /* fall through */ + LINEAR_SEARCH_CASE(9); /* fall through */ + LINEAR_SEARCH_CASE(8); /* fall through */ + LINEAR_SEARCH_CASE(7); /* fall through */ + LINEAR_SEARCH_CASE(6); /* fall through */ + LINEAR_SEARCH_CASE(5); /* fall through */ + LINEAR_SEARCH_CASE(4); /* fall through */ + LINEAR_SEARCH_CASE(3); /* fall through */ + LINEAR_SEARCH_CASE(2); /* fall through */ + case 1: + if (dl->items[dl->length].pgno == pgno) + return dl->length; + /* continue bsearch on the sorted part */ + break; + } + return (unsigned)(dp_bsearch(dl->items + 1, dl->sorted, pgno) - dl->items); +} + +MDBX_NOTHROW_PURE_FUNCTION static __inline unsigned +dpl_npages(const MDBX_dpl *dl, unsigned i) { + assert(0 <= (int)i && i <= dl->length); + unsigned n = likely(!dl->items[i].multi) ? 1 : dl->items[i].ptr->mp_pages; + assert(n == (IS_OVERFLOW(dl->items[i].ptr) ? dl->items[i].ptr->mp_pages : 1)); + return n; +} + +MDBX_NOTHROW_PURE_FUNCTION static __inline unsigned +dpl_endpgno(const MDBX_dpl *dl, unsigned i) { + return dpl_npages(dl, i) + dl->items[i].pgno; +} + +static __inline bool mdbx_dpl_intersect(const MDBX_txn *txn, pgno_t pgno, + unsigned npages) { + MDBX_dpl *dl = txn->tw.dirtylist; + assert(dl->sorted == dl->length); + assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); + unsigned const n = mdbx_dpl_search(txn, pgno); + assert(n >= 1 && n <= dl->length + 1); + assert(pgno <= dl->items[n].pgno); + assert(pgno > dl->items[n - 1].pgno); + const bool rc = + /* intersection with founded */ pgno + npages > dl->items[n].pgno || + /* intersection with prev */ dpl_endpgno(dl, n - 1) > pgno; + if (mdbx_assert_enabled()) { + bool check = false; + for (unsigned i = 1; i <= dl->length; ++i) { + const MDBX_page *const dp = dl->items[i].ptr; + if (!(dp->mp_pgno /* begin */ >= /* end */ pgno + npages || + dpl_endpgno(dl, i) /* end */ <= /* begin */ pgno)) + check |= true; + } + assert(check == rc); + } + return rc; +} + +static __always_inline unsigned mdbx_dpl_exist(MDBX_txn *txn, pgno_t pgno) { + MDBX_dpl *dl = txn->tw.dirtylist; + unsigned i = mdbx_dpl_search(txn, pgno); + assert((int)i > 0); + return (dl->items[i].pgno == pgno) ? 
i : 0; +} + +MDBX_MAYBE_UNUSED static const MDBX_page *debug_dpl_find(const MDBX_txn *txn, + const pgno_t pgno) { + const MDBX_dpl *dl = txn->tw.dirtylist; + assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); + for (unsigned i = dl->length; i > dl->sorted; --i) + if (dl->items[i].pgno == pgno) + return dl->items[i].ptr; + + if (dl->sorted) { + const unsigned i = + (unsigned)(dp_bsearch(dl->items + 1, dl->sorted, pgno) - dl->items); + if (dl->items[i].pgno == pgno) + return dl->items[i].ptr; + } + return nullptr; +} + +static void mdbx_dpl_remove(const MDBX_txn *txn, unsigned i) { + MDBX_dpl *dl = txn->tw.dirtylist; + assert((int)i > 0 && i <= dl->length); + assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); + dl->sorted -= dl->sorted >= i; + dl->length -= 1; + memmove(dl->items + i, dl->items + i + 1, + (dl->length - i + 2) * sizeof(dl->items[0])); + assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); +} + +static __always_inline int __must_check_result +mdbx_dpl_append(MDBX_txn *txn, pgno_t pgno, MDBX_page *page, unsigned npages) { + MDBX_dpl *dl = txn->tw.dirtylist; + assert(dl->length <= MDBX_PGL_LIMIT + MDBX_PNL_GRANULATE); + assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); + if (mdbx_audit_enabled()) { + for (unsigned i = dl->length; i > 0; --i) { + assert(dl->items[i].pgno != pgno); + if (unlikely(dl->items[i].pgno == pgno)) { + mdbx_error("Page %u already exist in the DPL at %u", pgno, i); + return MDBX_PROBLEM; + } + } + } + + const unsigned length = dl->length + 1; + const unsigned sorted = + (dl->sorted == dl->length && dl->items[dl->length].pgno < pgno) + ? length + : dl->sorted; + + if (unlikely(dl->length == dl->detent)) { + if (unlikely(dl->detent >= MDBX_PGL_LIMIT)) { + mdbx_error("DPL is full (MDBX_PGL_LIMIT %zu)", MDBX_PGL_LIMIT); + return MDBX_TXN_FULL; + } + const size_t size = (dl->detent < MDBX_PNL_INITIAL * 42) + ? 
dl->detent + dl->detent + : dl->detent + dl->detent / 2; + dl = mdbx_dpl_reserve(txn, size); + if (unlikely(!dl)) + return MDBX_ENOMEM; + mdbx_tassert(txn, dl->length < dl->detent); + } + + /* copy the stub beyond the end */ + dl->items[length + 1] = dl->items[length]; + /* append page */ + dl->items[length].ptr = page; + dl->items[length].pgno = pgno; + dl->items[length].multi = npages > 1; + dl->items[length].lru = txn->tw.dirtylru++; + dl->length = length; + dl->sorted = sorted; + assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); + return MDBX_SUCCESS; +} + +static __inline uint32_t mdbx_dpl_age(const MDBX_txn *txn, unsigned i) { + const MDBX_dpl *dl = txn->tw.dirtylist; + assert((int)i > 0 && i <= dl->length); + /* overflow could be here */ + return (txn->tw.dirtylru - dl->items[i].lru) & UINT32_C(0x7fffFFFF); +} + +/*----------------------------------------------------------------------------*/ + +uint8_t mdbx_runtime_flags = MDBX_RUNTIME_FLAGS_INIT; +uint8_t mdbx_loglevel = MDBX_LOG_FATAL; +MDBX_debug_func *mdbx_debug_logger; + +static __must_check_result __inline int mdbx_page_retire(MDBX_cursor *mc, + MDBX_page *mp); + +static int __must_check_result mdbx_page_dirty(MDBX_txn *txn, MDBX_page *mp, + unsigned npages); +struct page_result { + MDBX_page *page; + int err; +}; + +static struct page_result mdbx_page_alloc(MDBX_cursor *mc, const unsigned num, + int flags); +static txnid_t mdbx_kick_longlived_readers(MDBX_env *env, + const txnid_t laggard); + +static struct page_result mdbx_page_new(MDBX_cursor *mc, const unsigned flags, + const unsigned npages); +static int mdbx_page_touch(MDBX_cursor *mc); +static int mdbx_cursor_touch(MDBX_cursor *mc); +static int mdbx_touch_dbi(MDBX_cursor *mc); + +#define MDBX_END_NAMES \ + { \ + "committed", "empty-commit", "abort", "reset", "reset-tmp", "fail-begin", \ + "fail-beginchild" \ + } +enum { + /* mdbx_txn_end operation number, for logging */ + MDBX_END_COMMITTED, + MDBX_END_PURE_COMMIT, + MDBX_END_ABORT, + MDBX_END_RESET, + MDBX_END_RESET_TMP, + MDBX_END_FAIL_BEGIN, + MDBX_END_FAIL_BEGINCHILD +}; +#define MDBX_END_OPMASK 0x0F /* mask for mdbx_txn_end() operation number */ +#define MDBX_END_UPDATE 0x10 /* update env state (DBIs) */ +#define MDBX_END_FREE 0x20 /* free txn unless it is MDBX_env.me_txn0 */ +#define MDBX_END_EOTDONE 0x40 /* txn's cursors already closed */ +#define MDBX_END_SLOT 0x80 /* release any reader slot if MDBX_NOTLS */ +static int mdbx_txn_end(MDBX_txn *txn, const unsigned mode); + +__hot static struct page_result __must_check_result +mdbx_page_get_ex(MDBX_cursor *const mc, const pgno_t pgno, txnid_t front); +static __inline int __must_check_result mdbx_page_get(MDBX_cursor *mc, + pgno_t pgno, + MDBX_page **mp, + txnid_t front) { + + struct page_result ret = mdbx_page_get_ex(mc, pgno, front); + *mp = ret.page; + return ret.err; +} + +static int __must_check_result mdbx_page_search_root(MDBX_cursor *mc, + const MDBX_val *key, + int flags); + +#define MDBX_PS_MODIFY 1 +#define MDBX_PS_ROOTONLY 2 +#define MDBX_PS_FIRST 4 +#define MDBX_PS_LAST 8 +static int __must_check_result mdbx_page_search(MDBX_cursor *mc, + const MDBX_val *key, int flags); +static int __must_check_result mdbx_page_merge(MDBX_cursor *csrc, + MDBX_cursor *cdst); + +#define MDBX_SPLIT_REPLACE MDBX_APPENDDUP /* newkey is not new */ +static int __must_check_result mdbx_page_split(MDBX_cursor *mc, + const MDBX_val *const newkey, + MDBX_val *const newdata, + pgno_t newpgno, unsigned nflags); + +static int __must_check_result 
mdbx_read_header(MDBX_env *env, MDBX_meta *meta, + uint64_t *filesize, + const int lck_exclusive, + const mdbx_mode_t mode_bits); +static int __must_check_result mdbx_sync_locked(MDBX_env *env, unsigned flags, + MDBX_meta *const pending); +static int mdbx_env_close0(MDBX_env *env); + +struct node_result { + MDBX_node *node; + bool exact; +}; + +static struct node_result mdbx_node_search(MDBX_cursor *mc, + const MDBX_val *key); + +static int __must_check_result mdbx_node_add_branch(MDBX_cursor *mc, + unsigned indx, + const MDBX_val *key, + pgno_t pgno); +static int __must_check_result mdbx_node_add_leaf(MDBX_cursor *mc, + unsigned indx, + const MDBX_val *key, + MDBX_val *data, + unsigned flags); +static int __must_check_result mdbx_node_add_leaf2(MDBX_cursor *mc, + unsigned indx, + const MDBX_val *key); + +static void mdbx_node_del(MDBX_cursor *mc, size_t ksize); +static void mdbx_node_shrink(MDBX_page *mp, unsigned indx); +static int __must_check_result mdbx_node_move(MDBX_cursor *csrc, + MDBX_cursor *cdst, bool fromleft); +static int __must_check_result mdbx_node_read(MDBX_cursor *mc, MDBX_node *leaf, + MDBX_val *data, + const txnid_t front); +static int __must_check_result mdbx_rebalance(MDBX_cursor *mc); +static int __must_check_result mdbx_update_key(MDBX_cursor *mc, + const MDBX_val *key); + +static void mdbx_cursor_pop(MDBX_cursor *mc); +static int __must_check_result mdbx_cursor_push(MDBX_cursor *mc, MDBX_page *mp); + +static int __must_check_result mdbx_audit_ex(MDBX_txn *txn, + unsigned retired_stored, + bool dont_filter_gc); + +static int __must_check_result mdbx_page_check(MDBX_cursor *const mc, + const MDBX_page *const mp, + unsigned options); +static int __must_check_result mdbx_cursor_check(MDBX_cursor *mc, + unsigned options); +static int __must_check_result mdbx_cursor_del0(MDBX_cursor *mc); +static int __must_check_result mdbx_del0(MDBX_txn *txn, MDBX_dbi dbi, + const MDBX_val *key, + const MDBX_val *data, unsigned flags); +#define SIBLING_LEFT 0 +#define SIBLING_RIGHT 2 +static int __must_check_result mdbx_cursor_sibling(MDBX_cursor *mc, int dir); +static int __must_check_result mdbx_cursor_next(MDBX_cursor *mc, MDBX_val *key, + MDBX_val *data, + MDBX_cursor_op op); +static int __must_check_result mdbx_cursor_prev(MDBX_cursor *mc, MDBX_val *key, + MDBX_val *data, + MDBX_cursor_op op); +struct cursor_set_result { + int err; + bool exact; +}; + +static struct cursor_set_result mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, + MDBX_val *data, + MDBX_cursor_op op); +static int __must_check_result mdbx_cursor_first(MDBX_cursor *mc, MDBX_val *key, + MDBX_val *data); +static int __must_check_result mdbx_cursor_last(MDBX_cursor *mc, MDBX_val *key, + MDBX_val *data); + +static int __must_check_result mdbx_cursor_init(MDBX_cursor *mc, MDBX_txn *txn, + MDBX_dbi dbi); +static int __must_check_result mdbx_xcursor_init0(MDBX_cursor *mc); +static int __must_check_result mdbx_xcursor_init1(MDBX_cursor *mc, + MDBX_node *node, + const MDBX_page *mp); +static int __must_check_result mdbx_xcursor_init2(MDBX_cursor *mc, + MDBX_xcursor *src_mx, + bool new_dupdata); +static void cursor_copy(const MDBX_cursor *csrc, MDBX_cursor *cdst); + +static int __must_check_result mdbx_drop_tree(MDBX_cursor *mc, + const bool may_have_subDBs); +static int __must_check_result mdbx_fetch_sdb(MDBX_txn *txn, MDBX_dbi dbi); +static int __must_check_result mdbx_setup_dbx(MDBX_dbx *const dbx, + const MDBX_db *const db, + const unsigned pagesize); + +static MDBX_cmp_func cmp_lexical, cmp_reverse, cmp_int_align4, 
cmp_int_align2,
+    cmp_int_unaligned, cmp_lenfast;
+
+static __inline MDBX_cmp_func *get_default_keycmp(unsigned flags);
+static __inline MDBX_cmp_func *get_default_datacmp(unsigned flags);
+
+__cold const char *mdbx_liberr2str(int errnum) {
+  /* Table of descriptions for MDBX errors */
+  static const char *const tbl[] = {
+      "MDBX_KEYEXIST: Key/data pair already exists",
+      "MDBX_NOTFOUND: No matching key/data pair found",
+      "MDBX_PAGE_NOTFOUND: Requested page not found",
+      "MDBX_CORRUPTED: Database is corrupted",
+      "MDBX_PANIC: Environment had fatal error",
+      "MDBX_VERSION_MISMATCH: DB version mismatch libmdbx",
+      "MDBX_INVALID: File is not an MDBX file",
+      "MDBX_MAP_FULL: Environment mapsize limit reached",
+      "MDBX_DBS_FULL: Too many DBI-handles (maxdbs reached)",
+      "MDBX_READERS_FULL: Too many readers (maxreaders reached)",
+      NULL /* MDBX_TLS_FULL (-30789): unused in MDBX */,
+      "MDBX_TXN_FULL: Transaction has too many dirty pages,"
+      " i.e. the transaction is too big",
+      "MDBX_CURSOR_FULL: Cursor stack limit reached - this usually indicates"
+      " corruption, i.e. a branch-pages loop",
+      "MDBX_PAGE_FULL: Internal error - Page has no more space",
+      "MDBX_UNABLE_EXTEND_MAPSIZE: Database engine was unable to extend"
+      " the mapping, e.g. the address space is unavailable or busy,"
+      " or the operating system does not support such operations",
+      "MDBX_INCOMPATIBLE: Environment or database is not compatible"
+      " with the requested operation or the specified flags",
+      "MDBX_BAD_RSLOT: Invalid reuse of reader locktable slot,"
+      " e.g. a read transaction is already running for the current thread",
+      "MDBX_BAD_TXN: Transaction is not valid for requested operation,"
+      " e.g. it has errored and must be aborted, has a child, or is invalid",
+      "MDBX_BAD_VALSIZE: Invalid size or alignment of key or data"
+      " for the target database, or an invalid subDB name",
+      "MDBX_BAD_DBI: The specified DBI-handle is invalid"
+      " or changed by another thread/transaction",
+      "MDBX_PROBLEM: Unexpected internal error, transaction should be aborted",
+      "MDBX_BUSY: Another write transaction is running, or the environment is"
+      " already in use while opening with the MDBX_EXCLUSIVE flag",
+  };
+
+  if (errnum >= MDBX_KEYEXIST && errnum <= MDBX_BUSY) {
+    int i = errnum - MDBX_KEYEXIST;
+    return tbl[i];
+  }
+
+  switch (errnum) {
+  case MDBX_SUCCESS:
+    return "MDBX_SUCCESS: Successful";
+  case MDBX_EMULTIVAL:
+    return "MDBX_EMULTIVAL: The specified key has"
+           " more than one associated value";
+  case MDBX_EBADSIGN:
+    return "MDBX_EBADSIGN: Wrong signature of a runtime object(s),"
+           " e.g. memory corruption or double-free";
+  case MDBX_WANNA_RECOVERY:
+    return "MDBX_WANNA_RECOVERY: Database should be recovered,"
+           " but this could NOT be done automatically for now"
+           " since it was opened in read-only mode";
+  case MDBX_EKEYMISMATCH:
+    return "MDBX_EKEYMISMATCH: The given key value is mismatched to the"
+           " current cursor position";
+  case MDBX_TOO_LARGE:
+    return "MDBX_TOO_LARGE: Database is too large for the current system,"
+           " e.g. it could NOT be mapped into RAM";
+  case MDBX_THREAD_MISMATCH:
+    return "MDBX_THREAD_MISMATCH: A thread has attempted to use a not"
+           " owned object, e.g. 
a transaction that started by another thread"; + case MDBX_TXN_OVERLAPPING: + return "MDBX_TXN_OVERLAPPING: Overlapping read and write transactions for" + " the current thread"; + default: + return NULL; + } +} + +const char *__cold mdbx_strerror_r(int errnum, char *buf, size_t buflen) { + const char *msg = mdbx_liberr2str(errnum); + if (!msg && buflen > 0 && buflen < INT_MAX) { +#if defined(_WIN32) || defined(_WIN64) + const DWORD size = FormatMessageA( + FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, NULL, + errnum, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), buf, (DWORD)buflen, + NULL); + return size ? buf : "FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM) failed"; +#elif defined(_GNU_SOURCE) && defined(__GLIBC__) + /* GNU-specific */ + if (errnum > 0) + msg = strerror_r(errnum, buf, buflen); +#elif (_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600) + /* XSI-compliant */ + if (errnum > 0 && strerror_r(errnum, buf, buflen) == 0) + msg = buf; +#else + if (errnum > 0) { + msg = strerror(errnum); + if (msg) { + strncpy(buf, msg, buflen); + msg = buf; + } + } +#endif + if (!msg) { + (void)snprintf(buf, buflen, "error %d", errnum); + msg = buf; + } + buf[buflen - 1] = '\0'; + } + return msg; +} + +const char *__cold mdbx_strerror(int errnum) { +#if defined(_WIN32) || defined(_WIN64) + static char buf[1024]; + return mdbx_strerror_r(errnum, buf, sizeof(buf)); +#else + const char *msg = mdbx_liberr2str(errnum); + if (!msg) { + if (errnum > 0) + msg = strerror(errnum); + if (!msg) { + static char buf[32]; + (void)snprintf(buf, sizeof(buf) - 1, "error %d", errnum); + msg = buf; + } + } + return msg; +#endif +} + +#if defined(_WIN32) || defined(_WIN64) /* Bit of madness for Windows */ +const char *mdbx_strerror_r_ANSI2OEM(int errnum, char *buf, size_t buflen) { + const char *msg = mdbx_liberr2str(errnum); + if (!msg && buflen > 0 && buflen < INT_MAX) { + const DWORD size = FormatMessageA( + FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, NULL, + errnum, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), buf, (DWORD)buflen, + NULL); + if (!size) + msg = "FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM) failed"; + else if (!CharToOemBuffA(buf, buf, size)) + msg = "CharToOemBuffA() failed"; + else + msg = buf; + } + return msg; +} + +const char *mdbx_strerror_ANSI2OEM(int errnum) { + static char buf[1024]; + return mdbx_strerror_r_ANSI2OEM(errnum, buf, sizeof(buf)); +} +#endif /* Bit of madness for Windows */ + +void __cold mdbx_debug_log_va(int level, const char *function, int line, + const char *fmt, va_list args) { + if (mdbx_debug_logger) + mdbx_debug_logger(level, function, line, fmt, args); + else { +#if defined(_WIN32) || defined(_WIN64) + if (IsDebuggerPresent()) { + int prefix_len = 0; + char *prefix = nullptr; + if (function && line > 0) + prefix_len = mdbx_asprintf(&prefix, "%s:%d ", function, line); + else if (function) + prefix_len = mdbx_asprintf(&prefix, "%s: ", function); + else if (line > 0) + prefix_len = mdbx_asprintf(&prefix, "%d: ", line); + if (prefix_len > 0 && prefix) { + OutputDebugStringA(prefix); + mdbx_free(prefix); + } + char *msg = nullptr; + int msg_len = mdbx_vasprintf(&msg, fmt, args); + if (msg_len > 0 && msg) { + OutputDebugStringA(msg); + mdbx_free(msg); + } + } +#else + if (function && line > 0) + fprintf(stderr, "%s:%d ", function, line); + else if (function) + fprintf(stderr, "%s: ", function); + else if (line > 0) + fprintf(stderr, "%d: ", line); + vfprintf(stderr, fmt, args); + fflush(stderr); +#endif + } +} + +void __cold mdbx_debug_log(int level, 
const char *function, int line, + const char *fmt, ...) { + va_list args; + va_start(args, fmt); + mdbx_debug_log_va(level, function, line, fmt, args); + va_end(args); +} + +/* Dump a key in ascii or hexadecimal. */ +const char *mdbx_dump_val(const MDBX_val *key, char *const buf, + const size_t bufsize) { + if (!key) + return ""; + if (!key->iov_len) + return ""; + if (!buf || bufsize < 4) + return nullptr; + + bool is_ascii = true; + const uint8_t *const data = key->iov_base; + for (unsigned i = 0; i < key->iov_len; i++) + if (data[i] < ' ' || data[i] > '~') { + is_ascii = false; + break; + } + + if (is_ascii) { + int len = + snprintf(buf, bufsize, "%.*s", + (key->iov_len > INT_MAX) ? INT_MAX : (int)key->iov_len, data); + assert(len > 0 && (unsigned)len < bufsize); + (void)len; + } else { + char *const detent = buf + bufsize - 2; + char *ptr = buf; + *ptr++ = '<'; + for (unsigned i = 0; i < key->iov_len; i++) { + const ptrdiff_t left = detent - ptr; + assert(left > 0); + int len = snprintf(ptr, left, "%02x", data[i]); + if (len < 0 || len >= left) + break; + ptr += len; + } + if (ptr < detent) { + ptr[0] = '>'; + ptr[1] = '\0'; + } + } + return buf; +} + +/*------------------------------------------------------------------------------ + LY: debug stuff */ + +static const char *mdbx_leafnode_type(MDBX_node *n) { + static const char *const tp[2][2] = {{"", ": DB"}, + {": sub-page", ": sub-DB"}}; + return F_ISSET(node_flags(n), F_BIGDATA) + ? ": overflow page" + : tp[F_ISSET(node_flags(n), F_DUPDATA)] + [F_ISSET(node_flags(n), F_SUBDATA)]; +} + +/* Display all the keys in the page. */ +MDBX_MAYBE_UNUSED static void mdbx_page_list(MDBX_page *mp) { + pgno_t pgno = mp->mp_pgno; + const char *type; + MDBX_node *node; + unsigned i, nkeys, nsize, total = 0; + MDBX_val key; + DKBUF; + + switch (mp->mp_flags & + (P_BRANCH | P_LEAF | P_LEAF2 | P_META | P_OVERFLOW | P_SUBP)) { + case P_BRANCH: + type = "Branch page"; + break; + case P_LEAF: + type = "Leaf page"; + break; + case P_LEAF | P_SUBP: + type = "Leaf sub-page"; + break; + case P_LEAF | P_LEAF2: + type = "Leaf2 page"; + break; + case P_LEAF | P_LEAF2 | P_SUBP: + type = "Leaf2 sub-page"; + break; + case P_OVERFLOW: + mdbx_verbose("Overflow page %" PRIaPGNO " pages %u\n", pgno, mp->mp_pages); + return; + case P_META: + mdbx_verbose("Meta-page %" PRIaPGNO " txnid %" PRIu64 "\n", pgno, + unaligned_peek_u64(4, page_meta(mp)->mm_txnid_a)); + return; + default: + mdbx_verbose("Bad page %" PRIaPGNO " flags 0x%X\n", pgno, mp->mp_flags); + return; + } + + nkeys = page_numkeys(mp); + mdbx_verbose("%s %" PRIaPGNO " numkeys %u\n", type, pgno, nkeys); + + for (i = 0; i < nkeys; i++) { + if (IS_LEAF2(mp)) { /* LEAF2 pages have no mp_ptrs[] or node headers */ + key.iov_len = nsize = mp->mp_leaf2_ksize; + key.iov_base = page_leaf2key(mp, i, nsize); + total += nsize; + mdbx_verbose("key %u: nsize %u, %s\n", i, nsize, DKEY(&key)); + continue; + } + node = page_node(mp, i); + key.iov_len = node_ks(node); + key.iov_base = node->mn_data; + nsize = (unsigned)(NODESIZE + key.iov_len); + if (IS_BRANCH(mp)) { + mdbx_verbose("key %u: page %" PRIaPGNO ", %s\n", i, node_pgno(node), + DKEY(&key)); + total += nsize; + } else { + if (F_ISSET(node_flags(node), F_BIGDATA)) + nsize += sizeof(pgno_t); + else + nsize += (unsigned)node_ds(node); + total += nsize; + nsize += sizeof(indx_t); + mdbx_verbose("key %u: nsize %u, %s%s\n", i, nsize, DKEY(&key), + mdbx_leafnode_type(node)); + } + total = EVEN(total); + } + mdbx_verbose("Total: header %u + contents %u + unused %u\n", + 
IS_LEAF2(mp) ? PAGEHDRSZ : PAGEHDRSZ + mp->mp_lower, total, + page_room(mp)); +} + +/*----------------------------------------------------------------------------*/ + +/* Check if there is an initialized xcursor, so XCURSOR_REFRESH() is proper */ +#define XCURSOR_INITED(mc) \ + ((mc)->mc_xcursor && ((mc)->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) + +/* Update sub-page pointer, if any, in mc->mc_xcursor. + * Needed when the node which contains the sub-page may have moved. + * Called with mp = mc->mc_pg[mc->mc_top], ki = mc->mc_ki[mc->mc_top]. */ +#define XCURSOR_REFRESH(mc, mp, ki) \ + do { \ + MDBX_page *xr_pg = (mp); \ + MDBX_node *xr_node = page_node(xr_pg, ki); \ + if ((node_flags(xr_node) & (F_DUPDATA | F_SUBDATA)) == F_DUPDATA) \ + (mc)->mc_xcursor->mx_cursor.mc_pg[0] = node_data(xr_node); \ + } while (0) + +MDBX_MAYBE_UNUSED static bool cursor_is_tracked(const MDBX_cursor *mc) { + for (MDBX_cursor *scan = mc->mc_txn->tw.cursors[mc->mc_dbi]; scan; + scan = scan->mc_next) + if (mc == ((mc->mc_flags & C_SUB) ? &scan->mc_xcursor->mx_cursor : scan)) + return true; + return false; +} + +/* Perform act while tracking temporary cursor mn */ +#define WITH_CURSOR_TRACKING(mn, act) \ + do { \ + mdbx_cassert(&(mn), \ + mn.mc_txn->tw.cursors != NULL /* must be not rdonly txt */); \ + mdbx_cassert(&(mn), !cursor_is_tracked(&(mn))); \ + MDBX_cursor mc_dummy; \ + MDBX_cursor **tracking_head = &(mn).mc_txn->tw.cursors[mn.mc_dbi]; \ + MDBX_cursor *tracked = &(mn); \ + if ((mn).mc_flags & C_SUB) { \ + mc_dummy.mc_flags = C_INITIALIZED; \ + mc_dummy.mc_top = 0; \ + mc_dummy.mc_snum = 0; \ + mc_dummy.mc_xcursor = (MDBX_xcursor *)&(mn); \ + tracked = &mc_dummy; \ + } \ + tracked->mc_next = *tracking_head; \ + *tracking_head = tracked; \ + { act; } \ + *tracking_head = tracked->mc_next; \ + } while (0) + +int mdbx_cmp(const MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *a, + const MDBX_val *b) { + mdbx_assert(NULL, txn->mt_signature == MDBX_MT_SIGNATURE); + return txn->mt_dbxs[dbi].md_cmp(a, b); +} + +int mdbx_dcmp(const MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *a, + const MDBX_val *b) { + mdbx_assert(NULL, txn->mt_signature == MDBX_MT_SIGNATURE); + return txn->mt_dbxs[dbi].md_dcmp(a, b); +} + +/* Allocate memory for a page. + * Re-use old malloc'ed pages first for singletons, otherwise just malloc. + * Set MDBX_TXN_ERROR on failure. */ +static MDBX_page *mdbx_page_malloc(MDBX_txn *txn, unsigned num) { + MDBX_env *env = txn->mt_env; + MDBX_page *np = env->me_dp_reserve; + size_t size = env->me_psize; + if (likely(num == 1 && np)) { + mdbx_assert(env, env->me_dp_reserve_len > 0); + ASAN_UNPOISON_MEMORY_REGION(np, size); + VALGRIND_MEMPOOL_ALLOC(env, np, size); + VALGRIND_MAKE_MEM_DEFINED(&np->mp_next, sizeof(np->mp_next)); + env->me_dp_reserve = np->mp_next; + env->me_dp_reserve_len -= 1; + } else { + size = pgno2bytes(env, num); + np = mdbx_malloc(size); + if (unlikely(!np)) { + txn->mt_flags |= MDBX_TXN_ERROR; + return np; + } + VALGRIND_MEMPOOL_ALLOC(env, np, size); + } + + if ((env->me_flags & MDBX_NOMEMINIT) == 0) { + /* For a single page alloc, we init everything after the page header. + * For multi-page, we init the final page; if the caller needed that + * many pages they will be filling in at least up to the last page. 
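+     * E.g. for num == 3 the skip below becomes PAGEHDRSZ plus two whole
+     * pages, so only the tail of the third page (its size minus PAGEHDRSZ
+     * bytes) is actually zeroed.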
*/ + size_t skip = PAGEHDRSZ; + if (num > 1) + skip += pgno2bytes(env, num - 1); + memset((char *)np + skip, 0, size - skip); + } +#if MDBX_DEBUG + np->mp_pgno = 0; +#endif + VALGRIND_MAKE_MEM_UNDEFINED(np, size); + np->mp_flags = 0; + np->mp_pages = num; + return np; +} + +/* Free a shadow dirty page */ +static void mdbx_dpage_free(MDBX_env *env, MDBX_page *dp, unsigned npages) { + VALGRIND_MAKE_MEM_UNDEFINED(dp, pgno2bytes(env, npages)); + ASAN_UNPOISON_MEMORY_REGION(dp, pgno2bytes(env, npages)); + if (MDBX_DEBUG != 0 || unlikely(env->me_flags & MDBX_PAGEPERTURB)) + memset(dp, -1, pgno2bytes(env, npages)); + if (npages == 1 && + env->me_dp_reserve_len < env->me_options.dp_reserve_limit) { + ASAN_POISON_MEMORY_REGION((char *)dp + sizeof(dp->mp_next), + pgno2bytes(env, npages) - sizeof(dp->mp_next)); + dp->mp_next = env->me_dp_reserve; + VALGRIND_MEMPOOL_FREE(env, dp); + env->me_dp_reserve = dp; + env->me_dp_reserve_len += 1; + } else { + /* large pages just get freed directly */ + VALGRIND_MEMPOOL_FREE(env, dp); + mdbx_free(dp); + } +} + +/* Return all dirty pages to dpage list */ +static void mdbx_dlist_free(MDBX_txn *txn) { + MDBX_env *env = txn->mt_env; + MDBX_dpl *const dl = txn->tw.dirtylist; + + for (unsigned i = 1; i <= dl->length; i++) { + MDBX_page *dp = dl->items[i].ptr; + mdbx_dpage_free(env, dp, dpl_npages(dl, i)); + } + + dpl_clear(dl); +} + +static __always_inline MDBX_db *mdbx_outer_db(MDBX_cursor *mc) { + mdbx_cassert(mc, (mc->mc_flags & C_SUB) != 0); + MDBX_xcursor *mx = container_of(mc->mc_db, MDBX_xcursor, mx_db); + MDBX_cursor_couple *couple = container_of(mx, MDBX_cursor_couple, inner); + mdbx_cassert(mc, mc->mc_db == &couple->outer.mc_xcursor->mx_db); + mdbx_cassert(mc, mc->mc_dbx == &couple->outer.mc_xcursor->mx_dbx); + return couple->outer.mc_db; +} + +MDBX_MAYBE_UNUSED __cold static bool mdbx_dirtylist_check(MDBX_txn *txn) { + const MDBX_dpl *const dl = txn->tw.dirtylist; + assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); + mdbx_tassert(txn, txn->tw.dirtyroom + dl->length == + (txn->mt_parent ? 
txn->mt_parent->tw.dirtyroom + : txn->mt_env->me_options.dp_limit)); + + if (!mdbx_audit_enabled()) + return true; + + unsigned loose = 0; + for (unsigned i = dl->length; i > 0; --i) { + const MDBX_page *const dp = dl->items[i].ptr; + if (!dp) + continue; + + mdbx_tassert(txn, dp->mp_pgno == dl->items[i].pgno); + if (unlikely(dp->mp_pgno != dl->items[i].pgno)) + return false; + + const uint32_t age = mdbx_dpl_age(txn, i); + mdbx_tassert(txn, age < UINT32_MAX / 3); + if (unlikely(age > UINT32_MAX / 3)) + return false; + + mdbx_tassert(txn, dp->mp_flags == P_LOOSE || IS_MODIFIABLE(txn, dp)); + if (dp->mp_flags == P_LOOSE) { + loose += 1; + } else if (unlikely(!IS_MODIFIABLE(txn, dp))) + return false; + + const unsigned num = dpl_npages(dl, i); + mdbx_tassert(txn, txn->mt_next_pgno >= dp->mp_pgno + num); + if (unlikely(txn->mt_next_pgno < dp->mp_pgno + num)) + return false; + + if (i < dl->sorted) { + mdbx_tassert(txn, dl->items[i + 1].pgno >= dp->mp_pgno + num); + if (unlikely(dl->items[i + 1].pgno < dp->mp_pgno + num)) + return false; + } + + const unsigned rpa = mdbx_pnl_search(txn->tw.reclaimed_pglist, dp->mp_pgno); + mdbx_tassert(txn, rpa > MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) || + txn->tw.reclaimed_pglist[rpa] != dp->mp_pgno); + if (rpa <= MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) && + unlikely(txn->tw.reclaimed_pglist[rpa] == dp->mp_pgno)) + return false; + if (num > 1) { + const unsigned rpb = + mdbx_pnl_search(txn->tw.reclaimed_pglist, dp->mp_pgno + num - 1); + mdbx_tassert(txn, rpa == rpb); + if (unlikely(rpa != rpb)) + return false; + } + } + + mdbx_tassert(txn, loose == txn->tw.loose_count); + if (unlikely(loose != txn->tw.loose_count)) + return false; + + for (unsigned i = 1; i <= MDBX_PNL_SIZE(txn->tw.retired_pages); ++i) { + const MDBX_page *const dp = debug_dpl_find(txn, txn->tw.retired_pages[i]); + mdbx_tassert(txn, !dp); + if (unlikely(dp)) + return false; + } + + return true; +} + +#if MDBX_ENABLE_REFUND +static void mdbx_refund_reclaimed(MDBX_txn *txn) { + /* Scanning in descend order */ + pgno_t next_pgno = txn->mt_next_pgno; + const MDBX_PNL pnl = txn->tw.reclaimed_pglist; + mdbx_tassert(txn, MDBX_PNL_SIZE(pnl) && MDBX_PNL_MOST(pnl) == next_pgno - 1); +#if MDBX_PNL_ASCENDING + unsigned i = MDBX_PNL_SIZE(pnl); + mdbx_tassert(txn, pnl[i] == next_pgno - 1); + while (--next_pgno, --i > 0 && pnl[i] == next_pgno - 1) + ; + MDBX_PNL_SIZE(pnl) = i; +#else + unsigned i = 1; + mdbx_tassert(txn, pnl[i] == next_pgno - 1); + unsigned len = MDBX_PNL_SIZE(pnl); + while (--next_pgno, ++i <= len && pnl[i] == next_pgno - 1) + ; + MDBX_PNL_SIZE(pnl) = len -= i - 1; + for (unsigned move = 0; move < len; ++move) + pnl[1 + move] = pnl[i + move]; +#endif + mdbx_verbose("refunded %" PRIaPGNO " pages: %" PRIaPGNO " -> %" PRIaPGNO, + txn->mt_next_pgno - next_pgno, txn->mt_next_pgno, next_pgno); + txn->mt_next_pgno = next_pgno; + mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - 1)); +} + +static void mdbx_refund_loose(MDBX_txn *txn) { + mdbx_tassert(txn, txn->tw.loose_pages != nullptr); + mdbx_tassert(txn, txn->tw.loose_count > 0); + + MDBX_dpl *const dl = txn->tw.dirtylist; + mdbx_tassert(txn, dl->length >= txn->tw.loose_count); + + pgno_t onstack[MDBX_CACHELINE_SIZE * 8 / sizeof(pgno_t)]; + MDBX_PNL suitable = onstack; + + if (dl->length - dl->sorted > txn->tw.loose_count) { + /* Dirty list is useless since unsorted. 
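+     * Refunding must find the loose pages adjacent to mt_next_pgno, which
+     * requires order, so instead of sorting the whole dirty list, collect
+     * the suitable loose pgnos into a separate PNL (on-stack when it fits)
+     * and sort just that list.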
*/ + if (bytes2pnl(sizeof(onstack)) < txn->tw.loose_count) { + suitable = mdbx_pnl_alloc(txn->tw.loose_count); + if (unlikely(!suitable)) + return /* this is not a reason for transaction fail */; + } + + /* Collect loose-pages which may be refunded. */ + mdbx_tassert(txn, txn->mt_next_pgno >= MIN_PAGENO + txn->tw.loose_count); + pgno_t most = MIN_PAGENO; + unsigned w = 0; + for (const MDBX_page *lp = txn->tw.loose_pages; lp; lp = lp->mp_next) { + mdbx_tassert(txn, lp->mp_flags == P_LOOSE); + mdbx_tassert(txn, txn->mt_next_pgno > lp->mp_pgno); + if (likely(txn->mt_next_pgno - txn->tw.loose_count <= lp->mp_pgno)) { + mdbx_tassert(txn, + w < ((suitable == onstack) ? bytes2pnl(sizeof(onstack)) + : MDBX_PNL_ALLOCLEN(suitable))); + suitable[++w] = lp->mp_pgno; + most = (lp->mp_pgno > most) ? lp->mp_pgno : most; + } + } + + if (most + 1 == txn->mt_next_pgno) { + /* Sort suitable list and refund pages at the tail. */ + MDBX_PNL_SIZE(suitable) = w; + mdbx_pnl_sort(suitable); + + /* Scanning in descend order */ + const int step = MDBX_PNL_ASCENDING ? -1 : 1; + const int begin = MDBX_PNL_ASCENDING ? MDBX_PNL_SIZE(suitable) : 1; + const int end = MDBX_PNL_ASCENDING ? 0 : MDBX_PNL_SIZE(suitable) + 1; + mdbx_tassert(txn, suitable[begin] >= suitable[end - step]); + mdbx_tassert(txn, most == suitable[begin]); + + for (int i = begin + step; i != end; i += step) { + if (suitable[i] != most - 1) + break; + most -= 1; + } + const unsigned refunded = txn->mt_next_pgno - most; + mdbx_debug("refund-suitable %u pages %" PRIaPGNO " -> %" PRIaPGNO, + refunded, most, txn->mt_next_pgno); + txn->tw.loose_count -= refunded; + txn->tw.dirtyroom += refunded; + assert(txn->tw.dirtyroom <= txn->mt_env->me_options.dp_limit); + txn->mt_next_pgno = most; + + /* Filter-out dirty list */ + unsigned r = 0; + w = 0; + if (dl->sorted) { + do { + if (dl->items[++r].pgno < most) { + if (++w != r) + dl->items[w] = dl->items[r]; + } + } while (r < dl->sorted); + dl->sorted = w; + } + while (r < dl->length) { + if (dl->items[++r].pgno < most) { + if (++w != r) + dl->items[w] = dl->items[r]; + } + } + dpl_setlen(dl, w); + mdbx_tassert(txn, + txn->tw.dirtyroom + txn->tw.dirtylist->length == + (txn->mt_parent ? txn->mt_parent->tw.dirtyroom + : txn->mt_env->me_options.dp_limit)); + + goto unlink_loose; + } + } else { + /* Dirtylist is mostly sorted, just refund loose pages at the end. */ + mdbx_dpl_sort(txn); + mdbx_tassert(txn, dl->length < 2 || + dl->items[1].pgno < dl->items[dl->length].pgno); + mdbx_tassert(txn, dl->sorted == dl->length); + + /* Scan dirtylist tail-forward and cutoff suitable pages. */ + unsigned n; + for (n = dl->length; dl->items[n].pgno == txn->mt_next_pgno - 1 && + dl->items[n].ptr->mp_flags == P_LOOSE; + --n) { + mdbx_tassert(txn, n > 0); + MDBX_page *dp = dl->items[n].ptr; + mdbx_debug("refund-sorted page %" PRIaPGNO, dp->mp_pgno); + mdbx_tassert(txn, dp->mp_pgno == dl->items[n].pgno); + txn->mt_next_pgno -= 1; + } + dpl_setlen(dl, n); + + if (dl->sorted != dl->length) { + const unsigned refunded = dl->sorted - dl->length; + dl->sorted = dl->length; + txn->tw.loose_count -= refunded; + txn->tw.dirtyroom += refunded; + mdbx_tassert(txn, + txn->tw.dirtyroom + txn->tw.dirtylist->length == + (txn->mt_parent ? txn->mt_parent->tw.dirtyroom + : txn->mt_env->me_options.dp_limit)); + + /* Filter-out loose chain & dispose refunded pages. 
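+       * Entries at or above the new mt_next_pgno are unlinked from the
+       * loose-pages chain; their shadow copies are disposed via
+       * mdbx_dpage_free() unless MDBX_WRITEMAP is active, in which case
+       * the memory belongs to the mapped file and is not freed.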
*/ + unlink_loose: + for (MDBX_page **link = &txn->tw.loose_pages; *link;) { + MDBX_page *dp = *link; + mdbx_tassert(txn, dp->mp_flags == P_LOOSE); + if (txn->mt_next_pgno > dp->mp_pgno) { + link = &dp->mp_next; + } else { + *link = dp->mp_next; + if ((txn->mt_flags & MDBX_WRITEMAP) == 0) + mdbx_dpage_free(txn->mt_env, dp, 1); + } + } + } + } + + mdbx_tassert(txn, mdbx_dirtylist_check(txn)); + if (suitable != onstack) + mdbx_pnl_free(suitable); + txn->tw.loose_refund_wl = txn->mt_next_pgno; +} + +static bool mdbx_refund(MDBX_txn *txn) { + const pgno_t before = txn->mt_next_pgno; + + if (txn->tw.loose_pages && txn->tw.loose_refund_wl > txn->mt_next_pgno) + mdbx_refund_loose(txn); + + while (true) { + if (MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) == 0 || + MDBX_PNL_MOST(txn->tw.reclaimed_pglist) != txn->mt_next_pgno - 1) + break; + + mdbx_refund_reclaimed(txn); + if (!txn->tw.loose_pages || txn->tw.loose_refund_wl <= txn->mt_next_pgno) + break; + + const pgno_t memo = txn->mt_next_pgno; + mdbx_refund_loose(txn); + if (memo == txn->mt_next_pgno) + break; + } + + if (before == txn->mt_next_pgno) + return false; + + if (txn->tw.spill_pages) + /* Squash deleted pagenums if we refunded any */ + mdbx_spill_purge(txn); + + return true; +} +#else /* MDBX_ENABLE_REFUND */ +static __inline bool mdbx_refund(MDBX_txn *txn) { + (void)txn; + /* No online auto-compactification. */ + return false; +} +#endif /* MDBX_ENABLE_REFUND */ + +static __cold void mdbx_kill_page(MDBX_txn *txn, MDBX_page *mp, pgno_t pgno, + unsigned npages) { + MDBX_env *const env = txn->mt_env; + mdbx_debug("kill %u page(s) %" PRIaPGNO, npages, pgno); + mdbx_assert(env, pgno >= NUM_METAS && npages); + if (!IS_FROZEN(txn, mp)) { + const size_t bytes = pgno2bytes(env, npages); + memset(mp, -1, bytes); + mp->mp_pgno = pgno; + if ((env->me_flags & MDBX_WRITEMAP) == 0) + mdbx_pwrite(env->me_lazy_fd, mp, bytes, pgno2bytes(env, pgno)); + } else { + struct iovec iov[MDBX_COMMIT_PAGES]; + iov[0].iov_len = env->me_psize; + iov[0].iov_base = (char *)env->me_pbuf + env->me_psize; + size_t iov_off = pgno2bytes(env, pgno); + unsigned n = 1; + while (--npages) { + iov[n] = iov[0]; + if (++n == MDBX_COMMIT_PAGES) { + mdbx_pwritev(env->me_lazy_fd, iov, MDBX_COMMIT_PAGES, iov_off, + pgno2bytes(env, MDBX_COMMIT_PAGES)); + iov_off += pgno2bytes(env, MDBX_COMMIT_PAGES); + n = 0; + } + } + mdbx_pwritev(env->me_lazy_fd, iov, n, iov_off, pgno2bytes(env, n)); + } +} + +/* Remove page from dirty list */ +static __inline void mdbx_page_wash(MDBX_txn *txn, const unsigned di, + MDBX_page *const mp, + const unsigned npages) { + mdbx_tassert(txn, di && di <= txn->tw.dirtylist->length && + txn->tw.dirtylist->items[di].ptr == mp); + mdbx_dpl_remove(txn, di); + txn->tw.dirtyroom++; + mdbx_tassert(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == + (txn->mt_parent ? 
txn->mt_parent->tw.dirtyroom
+                                     : txn->mt_env->me_options.dp_limit));
+  mp->mp_txnid = INVALID_TXNID;
+  mp->mp_flags = 0xFFFF;
+  VALGRIND_MAKE_MEM_UNDEFINED(mp, PAGEHDRSZ);
+  if (txn->mt_flags & MDBX_WRITEMAP) {
+    VALGRIND_MAKE_MEM_NOACCESS(page_data(mp),
+                               pgno2bytes(txn->mt_env, npages) - PAGEHDRSZ);
+    ASAN_POISON_MEMORY_REGION(page_data(mp),
+                              pgno2bytes(txn->mt_env, npages) - PAGEHDRSZ);
+  } else
+    mdbx_dpage_free(txn->mt_env, mp, npages);
+}
+
+static __inline txnid_t pp_txnid4chk(const MDBX_page *mp, const MDBX_txn *txn) {
+  (void)txn;
+#if MDBX_DISABLE_PAGECHECKS
+  (void)mp;
+  return 0;
+#else
+  return /* maybe zero in legacy DB */ mp->mp_txnid;
+#endif /* !MDBX_DISABLE_PAGECHECKS */
+}
+
+/* Retire, loosen or free a single page.
+ *
+ * For dirty pages, saves single pages to a list for future reuse in this same
+ * txn. The page has been pulled from the GC and already resides on the dirty
+ * list, but has been deleted. Use these pages first before pulling again from
+ * the GC.
+ *
+ * If the page wasn't dirtied in this txn, just add it
+ * to this txn's free list. */
+static int mdbx_page_retire_ex(MDBX_cursor *mc, const pgno_t pgno,
+                               MDBX_page *mp /* maybe null */,
+                               int pagetype /* maybe unknown/zero */) {
+  int rc;
+  MDBX_txn *const txn = mc->mc_txn;
+  mdbx_tassert(txn, !mp || (mp->mp_pgno == pgno && PAGETYPE(mp) == pagetype));
+
+  /* When deleting entire subtrees, it is reasonable and possible to avoid
+   * reading leaf pages, i.e. to significantly reduce hard page-faults & IOPs:
+   *  - mp is null, i.e. the page has not yet been read;
+   *  - pagetype is known and the P_LEAF bit is set;
+   *  - we can determine the page status by scanning the lists
+   *    of dirty and spilled pages.
+   *
+   * On the other hand, this could be suboptimal for WRITEMAP mode, since it
+   * requires maintaining the list of dirty pages and avoiding explicit
+   * spilling. So, for flexibility and to avoid extra internal dependencies,
+   * we just fall back to reading if the dirty list has not been allocated yet.
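+   *
+   * Whichever entry path is taken, the classification below is the same: the
+   * page ends up as frozen (part of a readable MVCC snapshot), dirty in this
+   * txn (di != 0), spilled (si != 0), or shadowed by a parent txn.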
*/ + unsigned di = 0, si = 0, npages = 1; + bool is_frozen = false, is_spilled = false, is_shadowed = false; + if (unlikely(!mp)) { + if (mdbx_assert_enabled() && pagetype) { + MDBX_page *check; + rc = mdbx_page_get(mc, pgno, &check, txn->mt_front); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + mdbx_tassert(txn, (PAGETYPE(check) & ~P_LEAF2) == (pagetype & ~P_FROZEN)); + mdbx_tassert(txn, !(pagetype & P_FROZEN) || IS_FROZEN(txn, check)); + } + if (pagetype & P_FROZEN) { + is_frozen = true; + if (mdbx_assert_enabled()) { + for (MDBX_txn *scan = txn; scan; scan = scan->mt_parent) { + mdbx_tassert(txn, + !scan->tw.spill_pages || + !mdbx_pnl_exist(scan->tw.spill_pages, pgno << 1)); + mdbx_tassert(txn, !scan->tw.dirtylist || !debug_dpl_find(scan, pgno)); + } + } + goto status_done; + } else if (pagetype && txn->tw.dirtylist) { + if ((di = mdbx_dpl_exist(txn, pgno)) != 0) { + mp = txn->tw.dirtylist->items[di].ptr; + mdbx_tassert(txn, IS_MODIFIABLE(txn, mp)); + goto status_done; + } + if (txn->tw.spill_pages && + (si = mdbx_pnl_exist(txn->tw.spill_pages, pgno << 1)) != 0) { + is_spilled = true; + goto status_done; + } + for (MDBX_txn *parent = txn->mt_parent; parent; + parent = parent->mt_parent) { + if (mdbx_dpl_exist(parent, pgno)) { + is_shadowed = true; + goto status_done; + } + if (parent->tw.spill_pages && + mdbx_pnl_exist(parent->tw.spill_pages, pgno << 1)) { + is_spilled = true; + goto status_done; + } + } + is_frozen = true; + goto status_done; + } + + rc = mdbx_page_get(mc, pgno, &mp, txn->mt_front); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + mdbx_tassert(txn, !pagetype || PAGETYPE(mp) == pagetype); + pagetype = PAGETYPE(mp); + } + + is_frozen = IS_FROZEN(txn, mp); + if (!is_frozen) { + const bool is_dirty = IS_MODIFIABLE(txn, mp); + is_spilled = IS_SPILLED(txn, mp) && !(txn->mt_flags & MDBX_WRITEMAP); + is_shadowed = IS_SHADOWED(txn, mp); + if (is_dirty) { + mdbx_tassert(txn, !is_spilled); + mdbx_tassert(txn, !txn->tw.spill_pages || + !mdbx_pnl_exist(txn->tw.spill_pages, pgno << 1)); + mdbx_tassert(txn, debug_dpl_find(txn, pgno) == mp || txn->mt_parent || + (txn->mt_flags & MDBX_WRITEMAP)); + } else { + mdbx_tassert(txn, !debug_dpl_find(txn, pgno)); + } + + di = is_dirty ? mdbx_dpl_exist(txn, pgno) : 0; + si = (is_spilled && txn->tw.spill_pages) + ? 
mdbx_pnl_exist(txn->tw.spill_pages, pgno << 1)
+             : 0;
+    mdbx_tassert(txn, !is_dirty || di || (txn->mt_flags & MDBX_WRITEMAP));
+  } else {
+    mdbx_tassert(txn, !IS_MODIFIABLE(txn, mp));
+    mdbx_tassert(txn, !IS_SPILLED(txn, mp));
+    mdbx_tassert(txn, !IS_SHADOWED(txn, mp));
+  }
+
+status_done:
+  if (likely((pagetype & P_OVERFLOW) == 0)) {
+    STATIC_ASSERT(P_BRANCH == 1);
+    const bool is_branch = pagetype & P_BRANCH;
+    if (unlikely(mc->mc_flags & C_SUB)) {
+      MDBX_db *outer = mdbx_outer_db(mc);
+      mdbx_cassert(mc, !is_branch || outer->md_branch_pages > 0);
+      outer->md_branch_pages -= is_branch;
+      mdbx_cassert(mc, is_branch || outer->md_leaf_pages > 0);
+      outer->md_leaf_pages -= 1 - is_branch;
+    }
+    mdbx_cassert(mc, !is_branch || mc->mc_db->md_branch_pages > 0);
+    mc->mc_db->md_branch_pages -= is_branch;
+    mdbx_cassert(mc, (pagetype & P_LEAF) == 0 || mc->mc_db->md_leaf_pages > 0);
+    mc->mc_db->md_leaf_pages -= (pagetype & P_LEAF) != 0;
+  } else {
+    npages = mp->mp_pages;
+    mdbx_cassert(mc, mc->mc_db->md_overflow_pages >= npages);
+    mc->mc_db->md_overflow_pages -= npages;
+  }
+
+  if (is_frozen) {
+  retire:
+    mdbx_debug("retire %u page %" PRIaPGNO, npages, pgno);
+    rc = mdbx_pnl_append_range(false, &txn->tw.retired_pages, pgno, npages);
+    mdbx_tassert(txn, mdbx_dirtylist_check(txn));
+    return rc;
+  }
+
+  /* Return pages to the unallocated "tail" of the DB.
+   * The content of the pages is not destroyed, and for nested transactions
+   * the boundary of the unallocated "tail" is moved only when they commit. */
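+  /* Illustration of the condition below (numbers are made up): with
+   * mt_next_pgno == 1000, retiring a 3-page overflow chain at pgno 997
+   * (997 + 3 == 1000) simply pulls the allocated tail back to 997 instead of
+   * adding the pages to any list; mdbx_refund() may then cascade further
+   * while the new tail keeps matching reclaimed or loose pages. */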
+  if (MDBX_ENABLE_REFUND && unlikely(pgno + npages == txn->mt_next_pgno)) {
+    const char *kind = nullptr;
+    if (di) {
+      /* The page was dirtied in this transaction, but before that it could
+       * have been allocated, dirtied and spilled in one of the parent
+       * transactions. It CAN be pushed out into the unallocated tail. */
+      kind = "dirty";
+      /* Remove from dirty list */
+      mdbx_page_wash(txn, di, mp, npages);
+    } else if (si) {
+      /* The page was spilled in this transaction, i.e. it was allocated and
+       * dirtied in this or one of the parent transactions.
+       * It CAN be pushed out into the unallocated tail. */
+      kind = "spilled";
+      mdbx_spill_remove(txn, si, npages);
+    } else if ((txn->mt_flags & MDBX_WRITEMAP)) {
+      kind = "writemap";
+      mdbx_tassert(txn, mp && IS_MODIFIABLE(txn, mp));
+    } else {
+      /* The page was allocated, dirtied and possibly spilled in one of the
+       * parent transactions.
+       * It CAN be pushed out into the unallocated tail. */
+      kind = "parent's";
+      if (mdbx_assert_enabled() && mp) {
+        kind = nullptr;
+        for (MDBX_txn *parent = txn->mt_parent; parent;
+             parent = parent->mt_parent) {
+          if (parent->tw.spill_pages &&
+              mdbx_pnl_exist(parent->tw.spill_pages, pgno << 1)) {
+            kind = "parent-spilled";
+            mdbx_tassert(txn, is_spilled);
+            break;
+          }
+          if (mp == debug_dpl_find(parent, pgno)) {
+            kind = "parent-dirty";
+            mdbx_tassert(txn, !is_spilled);
+            break;
+          }
+        }
+        mdbx_tassert(txn, kind != nullptr);
+      }
+      mdbx_tassert(txn,
+                   is_spilled || is_shadowed || (mp && IS_SHADOWED(txn, mp)));
+    }
+    mdbx_debug("refunded %u %s page %" PRIaPGNO, npages, kind, pgno);
+    txn->mt_next_pgno = pgno;
+    mdbx_refund(txn);
+    return MDBX_SUCCESS;
+  }
+
+  if (di) {
+    /* Dirty page from this transaction */
+    /* If suitable we can reuse it through loose list */
+    if (likely(npages == 1 &&
+               txn->tw.loose_count < txn->mt_env->me_options.dp_loose_limit &&
+               (!MDBX_ENABLE_REFUND ||
+                /* skip pages near to the end in favor of compactification */
+                txn->mt_next_pgno >
+                    pgno + txn->mt_env->me_options.dp_loose_limit ||
+                txn->mt_next_pgno <= txn->mt_env->me_options.dp_loose_limit))) {
+      mdbx_debug("loosen dirty page %" PRIaPGNO, pgno);
+      mp->mp_flags = P_LOOSE;
+      mp->mp_next = txn->tw.loose_pages;
+      txn->tw.loose_pages = mp;
+      txn->tw.loose_count++;
+#if MDBX_ENABLE_REFUND
+      txn->tw.loose_refund_wl = (pgno + 2 > txn->tw.loose_refund_wl)
+                                    ? pgno + 2
+                                    : txn->tw.loose_refund_wl;
+#endif /* MDBX_ENABLE_REFUND */
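+      /* Note: loose_refund_wl is a refund watermark rather than a page
+       * number: mdbx_refund() re-examines loose pages only while
+       * loose_refund_wl > mt_next_pgno, so pgno + 2 keeps this page eligible
+       * exactly until it becomes the last allocated one
+       * (i.e. mt_next_pgno == pgno + 1). */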
+      if (MDBX_DEBUG != 0 ||
+          unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB))
+        memset(page_data(mp), -1, txn->mt_env->me_psize - PAGEHDRSZ);
+      VALGRIND_MAKE_MEM_NOACCESS(page_data(mp),
+                                 txn->mt_env->me_psize - PAGEHDRSZ);
+      ASAN_POISON_MEMORY_REGION(page_data(mp),
+                                txn->mt_env->me_psize - PAGEHDRSZ);
+      return MDBX_SUCCESS;
+    }
+
+#if !MDBX_DEBUG && !defined(MDBX_USE_VALGRIND) && !defined(__SANITIZE_ADDRESS__)
+    if (unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB))
+#endif
+    {
+      /* The page could have been modified in one of the parent transactions,
+       * including being spilled and later loaded and modified again.
+       * In both cases it must not be wiped on disk nor marked inaccessible
+       * for asan and/or valgrind. */
+      for (MDBX_txn *parent = txn->mt_parent;
+           parent && (parent->mt_flags & MDBX_TXN_SPILLS);
+           parent = parent->mt_parent) {
+        if (parent->tw.spill_pages &&
+            mdbx_pnl_intersect(parent->tw.spill_pages, pgno << 1, npages << 1))
+          goto skip_invalidate;
+        if (mdbx_dpl_intersect(parent, pgno, npages))
+          goto skip_invalidate;
+      }
+
+#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__)
+      if (MDBX_DEBUG != 0 ||
+          unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB))
+#endif
+        mdbx_kill_page(txn, mp, pgno, npages);
+      if (!(txn->mt_flags & MDBX_WRITEMAP)) {
+        VALGRIND_MAKE_MEM_NOACCESS(page_data(pgno2page(txn->mt_env, pgno)),
+                                   pgno2bytes(txn->mt_env, npages) - PAGEHDRSZ);
+        ASAN_POISON_MEMORY_REGION(page_data(pgno2page(txn->mt_env, pgno)),
+                                  pgno2bytes(txn->mt_env, npages) - PAGEHDRSZ);
+      }
+    }
+  skip_invalidate:
+    /* Remove from dirty list */
+    mdbx_page_wash(txn, di, mp, npages);
+
+  reclaim:
+    mdbx_debug("reclaim %u %s page %" PRIaPGNO, npages, "dirty", pgno);
+    rc = mdbx_pnl_insert_range(&txn->tw.reclaimed_pglist, pgno, npages);
+    mdbx_tassert(txn,
+                 mdbx_pnl_check4assert(txn->tw.reclaimed_pglist,
+                                       txn->mt_next_pgno - MDBX_ENABLE_REFUND));
+    mdbx_tassert(txn, mdbx_dirtylist_check(txn));
+    return rc;
+  }
+
+  if (si) {
+    /* Page was spilled in this txn */
+    mdbx_spill_remove(txn, si, npages);
+    /* The page could have been allocated and then spilled in this
+     * transaction, in which case it must be placed on the reclaimed list.
+     * Or it could have been allocated in one of the parent transactions and
+     * then spilled in this one, in which case it must be placed on the
+     * retired list for subsequent filtering at commit time. */
+    for (MDBX_txn *parent = txn->mt_parent; parent;
+         parent = parent->mt_parent) {
+      if (mdbx_dpl_exist(parent, pgno))
+        goto retire;
+    }
+    /* The page was definitely allocated in this transaction
+     * and may now be reused. */
+    goto reclaim;
+  }
+
+  if (is_shadowed) {
+    /* Dirty page MUST BE a clone from (one of) parent transaction(s). */
+    if (mdbx_assert_enabled()) {
+      const MDBX_page *parent_dp = nullptr;
+      /* Check parent(s)'s dirty lists. */
+      for (MDBX_txn *parent = txn->mt_parent; parent && !parent_dp;
+           parent = parent->mt_parent) {
+        mdbx_tassert(txn,
+                     !parent->tw.spill_pages ||
+                         !mdbx_pnl_exist(parent->tw.spill_pages, pgno << 1));
+        parent_dp = debug_dpl_find(parent, pgno);
+      }
+      mdbx_tassert(txn, parent_dp && (!mp || parent_dp == mp));
+    }
+    /* The page was allocated in a parent transaction and may be reused,
+     * but only within this transaction or its children. */
+    goto reclaim;
+  }
+
+  /* The page may belong to an MVCC snapshot that is visible to readers, or it
+   * could have been allocated and then spilled in one of the parent
+   * transactions. So for now it goes to the retired list, which will be
+   * filtered against the dirty and spilled lists of the parent transactions
+   * when the child transactions commit, or else will be written into the GC
+   * as is.
*/ + goto retire; +} + +static __inline int mdbx_page_retire(MDBX_cursor *mc, MDBX_page *mp) { + return mdbx_page_retire_ex(mc, mp->mp_pgno, mp, PAGETYPE(mp)); +} + +struct mdbx_iov_ctx { + unsigned iov_items; + size_t iov_bytes; + size_t iov_off; + pgno_t flush_begin; + pgno_t flush_end; + struct iovec iov[MDBX_COMMIT_PAGES]; +}; + +static __inline void mdbx_iov_init(MDBX_txn *const txn, + struct mdbx_iov_ctx *ctx) { + ctx->flush_begin = MAX_PAGENO; + ctx->flush_end = MIN_PAGENO; + ctx->iov_items = 0; + ctx->iov_bytes = 0; + ctx->iov_off = 0; + (void)txn; +} + +static __inline void mdbx_iov_done(MDBX_txn *const txn, + struct mdbx_iov_ctx *ctx) { + mdbx_tassert(txn, ctx->iov_items == 0); +#if defined(__linux__) || defined(__gnu_linux__) + MDBX_env *const env = txn->mt_env; + if (!(txn->mt_flags & MDBX_WRITEMAP) && + mdbx_linux_kernel_version < 0x02060b00) + /* Linux kernels older than version 2.6.11 ignore the addr and nbytes + * arguments, making this function fairly expensive. Therefore, the + * whole cache is always flushed. */ + mdbx_flush_incoherent_mmap( + env->me_map + pgno2bytes(env, ctx->flush_begin), + pgno2bytes(env, ctx->flush_end - ctx->flush_begin), env->me_os_psize); +#endif /* Linux */ +} + +static int mdbx_iov_write(MDBX_txn *const txn, struct mdbx_iov_ctx *ctx) { + mdbx_tassert(txn, !(txn->mt_flags & MDBX_WRITEMAP)); + mdbx_tassert(txn, ctx->iov_items > 0); + + MDBX_env *const env = txn->mt_env; + int rc; + if (likely(ctx->iov_items == 1)) { + mdbx_assert(env, ctx->iov_bytes == (size_t)ctx->iov[0].iov_len); + rc = mdbx_pwrite(env->me_lazy_fd, ctx->iov[0].iov_base, ctx->iov[0].iov_len, + ctx->iov_off); + } else { + rc = mdbx_pwritev(env->me_lazy_fd, ctx->iov, ctx->iov_items, ctx->iov_off, + ctx->iov_bytes); + } + + if (unlikely(rc != MDBX_SUCCESS)) + mdbx_error("Write error: %s", mdbx_strerror(rc)); + else { + VALGRIND_MAKE_MEM_DEFINED(txn->mt_env->me_map + ctx->iov_off, + ctx->iov_bytes); + ASAN_UNPOISON_MEMORY_REGION(txn->mt_env->me_map + ctx->iov_off, + ctx->iov_bytes); + } + + for (unsigned i = 0; i < ctx->iov_items; i++) + mdbx_dpage_free(env, (MDBX_page *)ctx->iov[i].iov_base, + bytes2pgno(env, ctx->iov[i].iov_len)); + +#if MDBX_ENABLE_PGOP_STAT + safe64_inc(&txn->mt_env->me_lck->mti_pgop_stat.wops, ctx->iov_items); +#endif /* MDBX_ENABLE_PGOP_STAT */ + ctx->iov_items = 0; + ctx->iov_bytes = 0; + return rc; +} + +static int iov_page(MDBX_txn *txn, struct mdbx_iov_ctx *ctx, MDBX_page *dp, + unsigned npages) { + MDBX_env *const env = txn->mt_env; + mdbx_tassert(txn, + dp->mp_pgno >= MIN_PAGENO && dp->mp_pgno < txn->mt_next_pgno); + mdbx_tassert(txn, IS_MODIFIABLE(txn, dp)); + mdbx_tassert(txn, + !(dp->mp_flags & ~(P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW))); + + ctx->flush_begin = + (ctx->flush_begin < dp->mp_pgno) ? ctx->flush_begin : dp->mp_pgno; + ctx->flush_end = (ctx->flush_end > dp->mp_pgno + npages) + ? 
ctx->flush_end + : dp->mp_pgno + npages; + env->me_lck->mti_unsynced_pages.weak += npages; + + if (IS_SHADOWED(txn, dp)) { + mdbx_tassert(txn, !(txn->mt_flags & MDBX_WRITEMAP)); + dp->mp_txnid = txn->mt_txnid; + mdbx_tassert(txn, IS_SPILLED(txn, dp)); + const size_t size = pgno2bytes(env, npages); + if (ctx->iov_off + ctx->iov_bytes != pgno2bytes(env, dp->mp_pgno) || + ctx->iov_items == ARRAY_LENGTH(ctx->iov) || + ctx->iov_bytes + size > MAX_WRITE) { + if (ctx->iov_items) { + int err = mdbx_iov_write(txn, ctx); + if (unlikely(err != MDBX_SUCCESS)) + return err; +#if defined(__linux__) || defined(__gnu_linux__) + if (mdbx_linux_kernel_version >= 0x02060b00) + /* Linux kernels older than version 2.6.11 ignore the addr and nbytes + * arguments, making this function fairly expensive. Therefore, the + * whole cache is always flushed. */ +#endif /* Linux */ + mdbx_flush_incoherent_mmap(env->me_map + ctx->iov_off, ctx->iov_bytes, + env->me_os_psize); + } + ctx->iov_off = pgno2bytes(env, dp->mp_pgno); + } + ctx->iov[ctx->iov_items].iov_base = (void *)dp; + ctx->iov[ctx->iov_items].iov_len = size; + ctx->iov_items += 1; + ctx->iov_bytes += size; + } else { + mdbx_tassert(txn, txn->mt_flags & MDBX_WRITEMAP); + } + return MDBX_SUCCESS; +} + +static int spill_page(MDBX_txn *txn, struct mdbx_iov_ctx *ctx, MDBX_page *dp, + unsigned npages) { + mdbx_tassert(txn, !(txn->mt_flags & MDBX_WRITEMAP)); + pgno_t pgno = dp->mp_pgno; + int err = iov_page(txn, ctx, dp, npages); + if (likely(err == MDBX_SUCCESS)) { + err = mdbx_pnl_append_range(true, &txn->tw.spill_pages, pgno << 1, npages); +#if MDBX_ENABLE_PGOP_STAT + if (likely(err == MDBX_SUCCESS)) + safe64_inc(&txn->mt_env->me_lck->mti_pgop_stat.spill, npages); +#endif /* MDBX_ENABLE_PGOP_STAT */ + } + return err; +} + +/* Set unspillable LRU-label for dirty pages watched by txn. + * Returns the number of pages marked as unspillable. */ +static unsigned mdbx_cursor_keep(MDBX_txn *txn, MDBX_cursor *mc) { + unsigned keep = 0; + while (mc->mc_flags & C_INITIALIZED) { + for (unsigned i = 0; i < mc->mc_snum; ++i) { + const MDBX_page *mp = mc->mc_pg[i]; + if (IS_MODIFIABLE(txn, mp) && !IS_SUBP(mp)) { + unsigned const n = mdbx_dpl_search(txn, mp->mp_pgno); + if (txn->tw.dirtylist->items[n].pgno == mp->mp_pgno && + mdbx_dpl_age(txn, n)) { + txn->tw.dirtylist->items[n].lru = txn->tw.dirtylru; + ++keep; + } + } + } + if (!mc->mc_xcursor) + break; + mc = &mc->mc_xcursor->mx_cursor; + } + return keep; +} + +static unsigned mdbx_txn_keep(MDBX_txn *txn, MDBX_cursor *m0) { + unsigned keep = m0 ? mdbx_cursor_keep(txn, m0) : 0; + for (unsigned i = FREE_DBI; i < txn->mt_numdbs; ++i) + if (F_ISSET(txn->mt_dbistate[i], DBI_DIRTY | DBI_VALID) && + txn->mt_dbs[i].md_root != P_INVALID) + for (MDBX_cursor *mc = txn->tw.cursors[i]; mc; mc = mc->mc_next) + if (mc != m0) + keep += mdbx_cursor_keep(txn, mc); + return keep; +} + +/* Returns the spilling priority (0..255) for a dirty page: + * 0 = should be spilled; + * ... + * > 255 = must not be spilled. */ +static unsigned spill_prio(const MDBX_txn *txn, const unsigned i, + const uint32_t reciprocal) { + MDBX_dpl *const dl = txn->tw.dirtylist; + const uint32_t age = mdbx_dpl_age(txn, i); + const unsigned npages = dpl_npages(dl, i); + const pgno_t pgno = dl->items[i].pgno; + if (age == 0) { + mdbx_debug("skip %s %u page %" PRIaPGNO, "keep", npages, pgno); + return 256; + } + + MDBX_page *const dp = dl->items[i].ptr; + if (dp->mp_flags & (P_LOOSE | P_SPILLED)) { + mdbx_debug("skip %s %u page %" PRIaPGNO, + (dp->mp_flags & P_LOOSE) + ? 
"loose" + : (dp->mp_flags & P_LOOSE) ? "loose" : "parent-spilled", + npages, pgno); + return 256; + } + + /* Can't spill twice, + * make sure it's not already in a parent's spill list(s). */ + MDBX_txn *parent = txn->mt_parent; + if (parent && (parent->mt_flags & MDBX_TXN_SPILLS)) { + do + if (parent->tw.spill_pages && + mdbx_pnl_intersect(parent->tw.spill_pages, pgno << 1, npages << 1)) { + mdbx_debug("skip-2 parent-spilled %u page %" PRIaPGNO, npages, pgno); + dp->mp_flags |= P_SPILLED; + return 256; + } + while ((parent = parent->mt_parent) != nullptr); + } + + mdbx_tassert(txn, age * (uint64_t)reciprocal < UINT32_MAX); + unsigned prio = age * reciprocal >> 24; + mdbx_tassert(txn, prio < 256); + if (likely(npages == 1)) + return prio = 256 - prio; + + /* make a large/overflow pages be likely to spill */ + uint32_t factor = npages | npages >> 1; + factor |= factor >> 2; + factor |= factor >> 4; + factor |= factor >> 8; + factor |= factor >> 16; + factor = prio * log2n_powerof2(factor + 1) + /* golden ratio */ 157; + factor = (factor < 256) ? 255 - factor : 0; + mdbx_tassert(txn, factor < 256 && factor < (256 - prio)); + return prio = factor; +} + +/* Spill pages from the dirty list back to disk. + * This is intended to prevent running into MDBX_TXN_FULL situations, + * but note that they may still occur in a few cases: + * + * 1) our estimate of the txn size could be too small. Currently this + * seems unlikely, except with a large number of MDBX_MULTIPLE items. + * + * 2) child txns may run out of space if their parents dirtied a + * lot of pages and never spilled them. TODO: we probably should do + * a preemptive spill during mdbx_txn_begin() of a child txn, if + * the parent's dirtyroom is below a given threshold. + * + * Otherwise, if not using nested txns, it is expected that apps will + * not run into MDBX_TXN_FULL any more. The pages are flushed to disk + * the same way as for a txn commit, e.g. their dirty status is cleared. + * If the txn never references them again, they can be left alone. + * If the txn only reads them, they can be used without any fuss. + * If the txn writes them again, they can be dirtied immediately without + * going thru all of the work of mdbx_page_touch(). Such references are + * handled by mdbx_page_unspill(). + * + * Also note, we never spill DB root pages, nor pages of active cursors, + * because we'll need these back again soon anyway. And in nested txns, + * we can't spill a page in a child txn if it was already spilled in a + * parent txn. That would alter the parent txns' data even though + * the child hasn't committed yet, and we'd have no way to undo it if + * the child aborted. */ +static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, + const unsigned need) { +#if xMDBX_DEBUG_SPILLING != 1 + /* production mode */ + if (likely(txn->tw.dirtyroom + txn->tw.loose_count >= need)) + return MDBX_SUCCESS; + unsigned wanna_spill = need - txn->tw.dirtyroom; +#else + /* debug mode: spill at least one page if xMDBX_DEBUG_SPILLING == 1 */ + unsigned wanna_spill = + (need > txn->tw.dirtyroom) ? need - txn->tw.dirtyroom : 1; +#endif /* xMDBX_DEBUG_SPILLING */ + + const unsigned dirty = txn->tw.dirtylist->length; + const unsigned spill_min = + txn->mt_env->me_options.spill_min_denominator + ? dirty / txn->mt_env->me_options.spill_min_denominator + : 0; + const unsigned spill_max = + dirty - (txn->mt_env->me_options.spill_max_denominator + ? dirty / txn->mt_env->me_options.spill_max_denominator + : 0); + wanna_spill = (wanna_spill > spill_min) ? 
+
+/* Spill pages from the dirty list back to disk.
+ * This is intended to prevent running into MDBX_TXN_FULL situations,
+ * but note that they may still occur in a few cases:
+ *
+ * 1) our estimate of the txn size could be too small. Currently this
+ *    seems unlikely, except with a large number of MDBX_MULTIPLE items.
+ *
+ * 2) child txns may run out of space if their parents dirtied a
+ *    lot of pages and never spilled them. TODO: we probably should do
+ *    a preemptive spill during mdbx_txn_begin() of a child txn, if
+ *    the parent's dirtyroom is below a given threshold.
+ *
+ * Otherwise, if not using nested txns, it is expected that apps will
+ * not run into MDBX_TXN_FULL any more. The pages are flushed to disk
+ * the same way as for a txn commit, e.g. their dirty status is cleared.
+ * If the txn never references them again, they can be left alone.
+ * If the txn only reads them, they can be used without any fuss.
+ * If the txn writes them again, they can be dirtied immediately without
+ * going through all of the work of mdbx_page_touch(). Such references are
+ * handled by mdbx_page_unspill().
+ *
+ * Also note, we never spill DB root pages, nor pages of active cursors,
+ * because we'll need these back again soon anyway. And in nested txns, we
+ * can't spill a page in a child txn if it was already spilled in a parent
+ * txn. That would alter the parent txns' data even though the child hasn't
+ * committed yet, and we'd have no way to undo it if the child aborted. */
+static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0,
+                          const unsigned need) {
+#if xMDBX_DEBUG_SPILLING != 1
+  /* production mode */
+  if (likely(txn->tw.dirtyroom + txn->tw.loose_count >= need))
+    return MDBX_SUCCESS;
+  unsigned wanna_spill = need - txn->tw.dirtyroom;
+#else
+  /* debug mode: spill at least one page if xMDBX_DEBUG_SPILLING == 1 */
+  unsigned wanna_spill =
+      (need > txn->tw.dirtyroom) ? need - txn->tw.dirtyroom : 1;
+#endif /* xMDBX_DEBUG_SPILLING */
+
+  const unsigned dirty = txn->tw.dirtylist->length;
+  const unsigned spill_min =
+      txn->mt_env->me_options.spill_min_denominator
+          ? dirty / txn->mt_env->me_options.spill_min_denominator
+          : 0;
+  const unsigned spill_max =
+      dirty - (txn->mt_env->me_options.spill_max_denominator
+                   ? dirty / txn->mt_env->me_options.spill_max_denominator
+                   : 0);
+  wanna_spill = (wanna_spill > spill_min) ? wanna_spill : spill_min;
+  wanna_spill = (wanna_spill < spill_max) ? wanna_spill : spill_max;
+  if (!wanna_spill)
+    return MDBX_SUCCESS;
+
+  mdbx_notice("spilling %u dirty-entries (have %u dirty-room, need %u)",
+              wanna_spill, txn->tw.dirtyroom, need);
+  mdbx_tassert(txn, txn->tw.dirtylist->length >= wanna_spill);
+
+  struct mdbx_iov_ctx ctx;
+  mdbx_iov_init(txn, &ctx);
+  int rc = MDBX_SUCCESS;
+  if (txn->mt_flags & MDBX_WRITEMAP) {
+    MDBX_dpl *const dl = txn->tw.dirtylist;
+    const unsigned span = dl->length - txn->tw.loose_count;
+    txn->tw.dirtyroom += span;
+    unsigned r, w;
+    for (w = 0, r = 1; r <= dl->length; ++r) {
+      MDBX_page *dp = dl->items[r].ptr;
+      if (dp->mp_flags & P_LOOSE)
+        dl->items[++w] = dl->items[r];
+      else if (!MDBX_FAKE_SPILL_WRITEMAP) {
+        rc = iov_page(txn, &ctx, dp, dpl_npages(dl, r));
+        mdbx_tassert(txn, rc == MDBX_SUCCESS);
+      }
+    }
+
+    mdbx_tassert(txn, span == r - 1 - w && w == txn->tw.loose_count);
+    dl->sorted = (dl->sorted == dl->length) ? w : 0;
+    dpl_setlen(dl, w);
+    mdbx_tassert(txn, mdbx_dirtylist_check(txn));
+
+    if (!MDBX_FAKE_SPILL_WRITEMAP && ctx.flush_end > ctx.flush_begin) {
+      MDBX_env *const env = txn->mt_env;
+#if MDBX_ENABLE_PGOP_STAT
+      safe64_inc(&env->me_lck->mti_pgop_stat.wops, 1);
+#endif /* MDBX_ENABLE_PGOP_STAT */
+      rc = mdbx_msync(&env->me_dxb_mmap,
+                      pgno_align2os_bytes(env, ctx.flush_begin),
+                      pgno_align2os_bytes(env, ctx.flush_end - ctx.flush_begin),
+                      MDBX_SYNC_NONE);
+    }
+    return rc;
+  }
+
+  mdbx_tassert(txn, !(txn->mt_flags & MDBX_WRITEMAP));
+  if (!txn->tw.spill_pages) {
+    txn->tw.spill_least_removed = INT_MAX;
+    txn->tw.spill_pages = mdbx_pnl_alloc(wanna_spill);
+    if (unlikely(!txn->tw.spill_pages)) {
+      rc = MDBX_ENOMEM;
+    bailout:
+      txn->mt_flags |= MDBX_TXN_ERROR;
+      return rc;
+    }
+  } else {
+    /* purge deleted slots */
+    mdbx_spill_purge(txn);
+    rc = mdbx_pnl_reserve(&txn->tw.spill_pages, wanna_spill);
+    (void)rc /* ignore since the resulting list may be shorter
+                and mdbx_pnl_append() will increase pnl on demand */
+        ;
+  }
+
+  /* Sort the dirty list so that writing to disk is more sequential */
+  MDBX_dpl *const dl = mdbx_dpl_sort(txn);
+
+  /* Preserve pages which may soon be dirtied again */
+  const unsigned unspillable = mdbx_txn_keep(txn, m0);
+  if (unspillable + txn->tw.loose_count >= dl->length) {
+#if xMDBX_DEBUG_SPILLING == 1 /* avoid false failure in debug mode */
+    if (likely(txn->tw.dirtyroom + txn->tw.loose_count >= need))
+      return MDBX_SUCCESS;
+#endif /* xMDBX_DEBUG_SPILLING */
+    mdbx_error("all %u dirty pages are unspillable since referenced "
+               "by a cursor(s), use fewer cursors or increase "
+               "MDBX_opt_txn_dp_limit",
+               unspillable);
+    return MDBX_TXN_FULL;
+  }
+
+  /* Subtask: push some pages out to disk in accordance with the LRU labels,
+   * while taking several important refinements into account:
+   *  - it is better to push out older large/overflow pages, since this frees
+   *    more memory at once and (in the current understanding) they are
+   *    modified again much less often;
+   *  - other things being equal, it is better to push out adjacent pages,
+   *    since this means fewer I/O operations;
+   *  - it is desirable to spend less time on this than
+   *    std::partial_sort_copy would;
+   *
+   * Solution:
+   *  - Quantize the whole range of lru-labels to 256 values and use a single
+   *    pass of an 8-bit radix-sort. The result is 256 levels of "freshness",
+   *    including the lru-label value older than which pages must be pushed
+   *    out;
+   *  - Move sequentially towards increasing page numbers and push out pages
+   *    whose lru-label is older than the cutoff value, until enough has been
+   *    pushed out;
+   *  - When encountering pages adjacent to those being pushed out, push them
+   *    out as well (to reduce the number of I/O operations), provided they
+   *    fall into the first half between the pushed-out pages and those with
+   *    the freshest lru-labels;
+   *  - additionally, during sorting, large/overflow pages are deliberately
+   *    aged, thereby increasing their chances of being pushed out. */
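+
+  /* Quantization sketch (illustrative numbers only): the reciprocal computed
+   * below is (255 << 24) / (age_max + 1), so spill_prio()'s
+   * (age * reciprocal) >> 24 maps 0..age_max onto 0..255 with one multiply
+   * and shift instead of a division per page; e.g. with age_max = 1000 an
+   * age of 500 lands on prio 127. */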
+
+  /* get min/max of LRU-labels */
+  uint32_t age_max = 0;
+  for (unsigned i = 1; i <= dl->length; ++i) {
+    const uint32_t age = mdbx_dpl_age(txn, i);
+    age_max = (age_max >= age) ? age_max : age;
+  }
+
+  mdbx_verbose("lru-head %u, age-max %u", txn->tw.dirtylru, age_max);
+
+  /* half of 8-bit radix-sort */
+  unsigned radix_counters[256], spillable = 0, spilled = 0;
+  memset(&radix_counters, 0, sizeof(radix_counters));
+  const uint32_t reciprocal = (UINT32_C(255) << 24) / (age_max + 1);
+  for (unsigned i = 1; i <= dl->length; ++i) {
+    unsigned prio = spill_prio(txn, i, reciprocal);
+    if (prio < 256) {
+      radix_counters[prio] += 1;
+      spillable += 1;
+    }
+  }
+
+  if (likely(spillable > 0)) {
+    unsigned prio2spill = 0, prio2adjacent = 128, amount = radix_counters[0];
+    for (unsigned i = 1; i < 256; i++) {
+      if (amount < wanna_spill) {
+        prio2spill = i;
+        prio2adjacent = i + (257 - i) / 2;
+        amount += radix_counters[i];
+      } else if (amount + amount < spillable + wanna_spill
+                 /* EQUIVALENT TO: amount - wanna_spill < spillable - amount */) {
+        prio2adjacent = i;
+        amount += radix_counters[i];
+      } else
+        break;
+    }
+
+    mdbx_verbose("prio2spill %u, prio2adjacent %u, amount %u, spillable %u, "
+                 "wanna_spill %u",
+                 prio2spill, prio2adjacent, amount, spillable, wanna_spill);
+    mdbx_tassert(txn, prio2spill < prio2adjacent && prio2adjacent <= 256);
+
+    unsigned prev_prio = 256;
+    unsigned r, w, prio;
+    for (w = 0, r = 1; r <= dl->length && spilled < wanna_spill;
+         prev_prio = prio, ++r) {
+      prio = spill_prio(txn, r, reciprocal);
+      MDBX_page *const dp = dl->items[r].ptr;
+      if (prio < prio2adjacent) {
+        const pgno_t pgno = dl->items[r].pgno;
+        const unsigned npages = dpl_npages(dl, r);
+        if (prio <= prio2spill) {
+          if (prev_prio < prio2adjacent && prev_prio > prio2spill &&
+              dpl_endpgno(dl, r - 1) == pgno) {
+            mdbx_debug("co-spill %u prev-adjacent page %" PRIaPGNO
+                       " (age %d, prio %u)",
+                       dpl_npages(dl, w), dl->items[r - 1].pgno,
+                       mdbx_dpl_age(txn, r - 1), prev_prio);
+            --w;
+            rc = spill_page(txn, &ctx, dl->items[r - 1].ptr,
+                            dpl_npages(dl, r - 1));
+            if (unlikely(rc != MDBX_SUCCESS))
+              break;
+            ++spilled;
+          }
+
+          mdbx_debug("spill %u page %" PRIaPGNO " (age %d, prio %u)", npages,
+                     dp->mp_pgno, mdbx_dpl_age(txn, r), prio);
+          rc = spill_page(txn, &ctx, dp, npages);
+          if (unlikely(rc != MDBX_SUCCESS))
+            break;
+          ++spilled;
+          continue;
+        }
+
+        if (prev_prio <= prio2spill && dpl_endpgno(dl, r - 1) == pgno) {
+          mdbx_debug("co-spill %u next-adjacent page %" PRIaPGNO
+                     " (age %d, prio %u)",
+                     npages, dp->mp_pgno, mdbx_dpl_age(txn, r), prio);
+          rc = spill_page(txn, &ctx, dp, npages);
+          if (unlikely(rc != MDBX_SUCCESS))
+            break;
+          prio = prev_prio /* to continue co-spilling next adjacent pages */;
+          ++spilled;
+          continue;
+        }
+      }
+      dl->items[++w] = dl->items[r];
+    }
+
+    mdbx_tassert(txn, spillable == 0 || spilled > 0);
+
+    while (r <= dl->length)
+      dl->items[++w] = dl->items[r++];
+
+    mdbx_tassert(txn, r - 1 - w == spilled);
+
+    dl->sorted = dpl_setlen(dl, w);
+    txn->tw.dirtyroom += spilled;
+    mdbx_tassert(txn, mdbx_dirtylist_check(txn));
+
+    if (ctx.iov_items)
+      rc = mdbx_iov_write(txn, &ctx);
+
+    if (unlikely(rc != MDBX_SUCCESS))
+      goto bailout;
+
+    mdbx_pnl_sort(txn->tw.spill_pages);
+    txn->mt_flags |= MDBX_TXN_SPILLS;
+    mdbx_notice("spilled %u dirty-entries, now have %u dirty-room", spilled,
+                txn->tw.dirtyroom);
+    mdbx_iov_done(txn, &ctx);
+  } else {
+    mdbx_tassert(txn, ctx.iov_items == 0 && rc == MDBX_SUCCESS);
+    for (unsigned i = 1; i <= dl->length; ++i) {
+      MDBX_page *dp = dl->items[i].ptr;
+      mdbx_notice(
+          "dirtylist[%u]: pgno %u, npages %u, flags 0x%04X, age %u, prio %u", i,
+          dp->mp_pgno, dpl_npages(dl, i), dp->mp_flags, mdbx_dpl_age(txn, i),
+          spill_prio(txn, i, reciprocal));
+    }
+  }
+
+#if xMDBX_DEBUG_SPILLING == 2
+  if (txn->tw.loose_count + txn->tw.dirtyroom <= need / 2 + 1)
+    mdbx_error("dirty-list length: before %u, after %u, parent %i, loose %u; "
+               "needed %u, spillable %u; "
+               "spilled %u dirty-entries, now have %u dirty-room",
+               dl->length + spilled, dl->length,
+               (txn->mt_parent && txn->mt_parent->tw.dirtylist)
+                   ? (int)txn->mt_parent->tw.dirtylist->length
+                   : -1,
+               txn->tw.loose_count, need, spillable, spilled,
+               txn->tw.dirtyroom);
+  mdbx_ensure(txn->mt_env, txn->tw.loose_count + txn->tw.dirtyroom > need / 2);
+#endif /* xMDBX_DEBUG_SPILLING */
+
+  return likely(txn->tw.loose_count + txn->tw.dirtyroom > need / 2)
+             ? MDBX_SUCCESS
+             : MDBX_TXN_FULL;
+}
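+
+/* Note: mdbx_cursor_spill() below is the write-path entry point of the
+ * spilling machinery: it estimates the worst-case number of pages a pending
+ * put may dirty and delegates to mdbx_txn_spill(), which does nothing while
+ * dirtyroom plus the loose count still covers that estimate. */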
+
+static int mdbx_cursor_spill(MDBX_cursor *mc, const MDBX_val *key,
+                             const MDBX_val *data) {
+  MDBX_txn *txn = mc->mc_txn;
+  /* Estimate how much space this operation will take: */
+  /* 1) Max b-tree height, reasonable enough even including dups' sub-tree */
+  unsigned need = CURSOR_STACK + 3;
+  /* 2) GC/FreeDB for any payload */
+  if (mc->mc_dbi > FREE_DBI) {
+    need += txn->mt_dbs[FREE_DBI].md_depth + 3;
+    /* 3) Named DBs also dirty the main DB */
+    if (mc->mc_dbi > MAIN_DBI)
+      need += txn->mt_dbs[MAIN_DBI].md_depth + 3;
+  }
+#if xMDBX_DEBUG_SPILLING != 2
+  /* production mode */
+  /* 4) Double the page chain estimation
+   * to allow for extensive splitting, rebalancing and merging */
+  need += need;
+  /* 5) Factor in the key+data pair to be put in */
+  need += bytes2pgno(txn->mt_env, node_size(key, data)) + 1;
+#else
+  /* debug mode */
+  (void)key;
+  (void)data;
+  mc->mc_txn->mt_env->debug_dirtied_est = ++need;
+  mc->mc_txn->mt_env->debug_dirtied_act = 0;
+#endif /* xMDBX_DEBUG_SPILLING == 2 */
+
+  return mdbx_txn_spill(txn, mc, need);
+}
+
+/*----------------------------------------------------------------------------*/
+
+static __always_inline bool meta_bootid_match(const MDBX_meta *meta) {
+  return memcmp(&meta->mm_bootid, &bootid, 16) == 0 &&
+         (bootid.x | bootid.y) != 0;
+}
+
+static bool meta_weak_acceptable(const MDBX_env *env, const MDBX_meta *meta,
+                                 const int lck_exclusive) {
+  return lck_exclusive
+             ? /* exclusive lock */ meta_bootid_match(meta)
+             : /* db already opened */ env->me_lck_mmap.lck &&
+                   (env->me_lck_mmap.lck->mti_envmode.weak & MDBX_RDONLY) == 0;
+}
+
+#define METAPAGE(env, n) page_meta(pgno2page(env, n))
+#define METAPAGE_END(env) METAPAGE(env, NUM_METAS)
+
+static __inline txnid_t meta_txnid(const MDBX_env *env, const MDBX_meta *meta,
+                                   const bool allow_volatile) {
+  mdbx_memory_fence(mo_AcquireRelease, false);
+  txnid_t a = unaligned_peek_u64(4, &meta->mm_txnid_a);
+  txnid_t b = unaligned_peek_u64(4, &meta->mm_txnid_b);
+  if (allow_volatile)
+    return (a == b) ? a : 0;
+  mdbx_assert(env, a == b);
+  return a;
+}
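+
+/* The a/b pair read in meta_txnid() implements a seqlock-like protocol: a
+ * writer first zeroes mm_txnid_b, publishes the new mm_txnid_a, and restores
+ * mm_txnid_b last (see mdbx_meta_update_begin()/_end() below), so a == b if
+ * and only if the meta-page is in a steady state; the allow_volatile variant
+ * reports a torn read as 0 instead of asserting. */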
+
+static __inline txnid_t mdbx_meta_txnid_stable(const MDBX_env *env,
+                                               const MDBX_meta *meta) {
+  return meta_txnid(env, meta, false);
+}
+
+static __inline txnid_t mdbx_meta_txnid_fluid(const MDBX_env *env,
+                                              const MDBX_meta *meta) {
+  return meta_txnid(env, meta, true);
+}
+
+static __inline void mdbx_meta_update_begin(const MDBX_env *env,
+                                            MDBX_meta *meta, txnid_t txnid) {
+  mdbx_assert(env, meta >= METAPAGE(env, 0) && meta < METAPAGE_END(env));
+  mdbx_assert(env, unaligned_peek_u64(4, meta->mm_txnid_a) < txnid &&
+                       unaligned_peek_u64(4, meta->mm_txnid_b) < txnid);
+  (void)env;
+  unaligned_poke_u64(4, meta->mm_txnid_b, 0);
+  mdbx_memory_fence(mo_AcquireRelease, true);
+  unaligned_poke_u64(4, meta->mm_txnid_a, txnid);
+}
+
+static __inline void mdbx_meta_update_end(const MDBX_env *env, MDBX_meta *meta,
+                                          txnid_t txnid) {
+  mdbx_assert(env, meta >= METAPAGE(env, 0) && meta < METAPAGE_END(env));
+  mdbx_assert(env, unaligned_peek_u64(4, meta->mm_txnid_a) == txnid);
+  mdbx_assert(env, unaligned_peek_u64(4, meta->mm_txnid_b) < txnid);
+  (void)env;
+  mdbx_jitter4testing(true);
+  memcpy(&meta->mm_bootid, &bootid, 16);
+  unaligned_poke_u64(4, meta->mm_txnid_b, txnid);
+  mdbx_memory_fence(mo_AcquireRelease, true);
+}
+
+static __inline void mdbx_meta_set_txnid(const MDBX_env *env, MDBX_meta *meta,
+                                         txnid_t txnid) {
+  mdbx_assert(env, !env->me_map || meta < METAPAGE(env, 0) ||
+                       meta >= METAPAGE_END(env));
+  (void)env;
+  /* a momentarily inconsistent update is ok here, since this function is used
+   * ONLY for filling a meta-image for writing, but not the actual meta-page */
+  memcpy(&meta->mm_bootid, &bootid, 16);
+  unaligned_poke_u64(4, meta->mm_txnid_a, txnid);
+  unaligned_poke_u64(4, meta->mm_txnid_b, txnid);
+}
+
+static __inline uint64_t mdbx_meta_sign(const MDBX_meta *meta) {
+  uint64_t sign = MDBX_DATASIGN_NONE;
+#if 0 /* TODO */
+  sign = hippeus_hash64(...);
+#else
+  (void)meta;
+#endif
+  /* LY: never returns MDBX_DATASIGN_NONE or MDBX_DATASIGN_WEAK */
+  return (sign > MDBX_DATASIGN_WEAK) ? sign : ~sign;
+}
+
+enum meta_choise_mode { prefer_last, prefer_steady };
+
+static __inline bool mdbx_meta_ot(const enum meta_choise_mode mode,
+                                  const MDBX_env *env, const MDBX_meta *a,
+                                  const MDBX_meta *b) {
+  mdbx_jitter4testing(true);
+  txnid_t txnid_a = mdbx_meta_txnid_fluid(env, a);
+  txnid_t txnid_b = mdbx_meta_txnid_fluid(env, b);
+
+  mdbx_jitter4testing(true);
+  switch (mode) {
+  default:
+    assert(false);
+    __unreachable();
+    /* fall through */
+    __fallthrough;
+  case prefer_steady:
+    if (META_IS_STEADY(a) != META_IS_STEADY(b))
+      return META_IS_STEADY(b);
+    /* fall through */
+    __fallthrough;
+  case prefer_last:
+    mdbx_jitter4testing(true);
+    if (txnid_a == txnid_b)
+      return META_IS_STEADY(b);
+    return txnid_a < txnid_b;
+  }
+}
+
+static __inline bool mdbx_meta_eq(const MDBX_env *env, const MDBX_meta *a,
+                                  const MDBX_meta *b) {
+  mdbx_jitter4testing(true);
+  const txnid_t txnid = mdbx_meta_txnid_fluid(env, a);
+  if (!txnid || txnid != mdbx_meta_txnid_fluid(env, b))
+    return false;
+
+  mdbx_jitter4testing(true);
+  if (META_IS_STEADY(a) != META_IS_STEADY(b))
+    return false;
+
+  mdbx_jitter4testing(true);
+  return true;
+}
+
+static int mdbx_meta_eq_mask(const MDBX_env *env) {
+  MDBX_meta *m0 = METAPAGE(env, 0);
+  MDBX_meta *m1 = METAPAGE(env, 1);
+  MDBX_meta *m2 = METAPAGE(env, 2);
+
+  int rc = mdbx_meta_eq(env, m0, m1) ? 
1 : 0; + if (mdbx_meta_eq(env, m1, m2)) + rc += 2; + if (mdbx_meta_eq(env, m2, m0)) + rc += 4; + return rc; +} + +static __inline MDBX_meta *mdbx_meta_recent(const enum meta_choise_mode mode, + const MDBX_env *env, MDBX_meta *a, + MDBX_meta *b) { + const bool a_older_that_b = mdbx_meta_ot(mode, env, a, b); + mdbx_assert(env, !mdbx_meta_eq(env, a, b)); + return a_older_that_b ? b : a; +} + +static __inline MDBX_meta *mdbx_meta_ancient(const enum meta_choise_mode mode, + const MDBX_env *env, MDBX_meta *a, + MDBX_meta *b) { + const bool a_older_that_b = mdbx_meta_ot(mode, env, a, b); + mdbx_assert(env, !mdbx_meta_eq(env, a, b)); + return a_older_that_b ? a : b; +} + +static __inline MDBX_meta * +mdbx_meta_mostrecent(const enum meta_choise_mode mode, const MDBX_env *env) { + MDBX_meta *m0 = METAPAGE(env, 0); + MDBX_meta *m1 = METAPAGE(env, 1); + MDBX_meta *m2 = METAPAGE(env, 2); + + MDBX_meta *head = mdbx_meta_recent(mode, env, m0, m1); + head = mdbx_meta_recent(mode, env, head, m2); + return head; +} + +static MDBX_meta *mdbx_meta_steady(const MDBX_env *env) { + return mdbx_meta_mostrecent(prefer_steady, env); +} + +static MDBX_meta *mdbx_meta_head(const MDBX_env *env) { + return mdbx_meta_mostrecent(prefer_last, env); +} + +static txnid_t mdbx_recent_committed_txnid(const MDBX_env *env) { + while (true) { + const MDBX_meta *head = mdbx_meta_head(env); + const txnid_t recent = mdbx_meta_txnid_fluid(env, head); + mdbx_compiler_barrier(); + if (likely(head == mdbx_meta_head(env) && + recent == mdbx_meta_txnid_fluid(env, head))) + return recent; + } +} + +static txnid_t mdbx_recent_steady_txnid(const MDBX_env *env) { + while (true) { + const MDBX_meta *head = mdbx_meta_steady(env); + const txnid_t recent = mdbx_meta_txnid_fluid(env, head); + mdbx_compiler_barrier(); + if (likely(head == mdbx_meta_steady(env) && + recent == mdbx_meta_txnid_fluid(env, head))) + return recent; + } +} + +static const char *mdbx_durable_str(const MDBX_meta *const meta) { + if (META_IS_STEADY(meta)) + return (unaligned_peek_u64(4, meta->mm_datasync_sign) == + mdbx_meta_sign(meta)) + ? "Steady" + : "Tainted"; + return "Weak"; +} + +/*----------------------------------------------------------------------------*/ + +/* Find oldest txnid still referenced. 
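+ * Here this means the minimum txnid across the live reader slots of the lock
+ * table; GC records with a txnid at or above this bound are still referenced
+ * by some MVCC snapshot and must not be recycled.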
*/ +static txnid_t mdbx_find_oldest(const MDBX_txn *txn) { + mdbx_tassert(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); + MDBX_env *env = txn->mt_env; + const txnid_t edge = mdbx_recent_steady_txnid(env); + mdbx_tassert(txn, edge <= txn->mt_txnid); + + MDBX_lockinfo *const lck = env->me_lck_mmap.lck; + if (unlikely(lck == NULL /* exclusive mode */)) + return atomic_store64(&lck->mti_oldest_reader, edge, mo_Relaxed); + + const txnid_t last_oldest = + atomic_load64(&lck->mti_oldest_reader, mo_AcquireRelease); + mdbx_tassert(txn, edge >= last_oldest); + if (likely(last_oldest == edge)) + return edge; + + const uint32_t nothing_changed = MDBX_STRING_TETRAD("None"); + const uint32_t snap_readers_refresh_flag = + atomic_load32(&lck->mti_readers_refresh_flag, mo_AcquireRelease); + mdbx_jitter4testing(false); + if (snap_readers_refresh_flag == nothing_changed) + return last_oldest; + + txnid_t oldest = edge; + atomic_store32(&lck->mti_readers_refresh_flag, nothing_changed, mo_Relaxed); + const unsigned snap_nreaders = + atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); + for (unsigned i = 0; i < snap_nreaders; ++i) { + if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease)) { + /* mdbx_jitter4testing(true); */ + const txnid_t snap = safe64_read(&lck->mti_readers[i].mr_txnid); + if (oldest > snap && last_oldest <= /* ignore pending updates */ snap) { + oldest = snap; + if (oldest == last_oldest) + return oldest; + } + } + } + + if (oldest != last_oldest) { + mdbx_notice("update oldest %" PRIaTXN " -> %" PRIaTXN, last_oldest, oldest); + mdbx_tassert(txn, oldest >= lck->mti_oldest_reader.weak); + atomic_store64(&lck->mti_oldest_reader, oldest, mo_Relaxed); + } + return oldest; +} + +/* Find largest mvcc-snapshot still referenced. */ +static __cold pgno_t mdbx_find_largest(MDBX_env *env, pgno_t largest) { + MDBX_lockinfo *const lck = env->me_lck_mmap.lck; + if (likely(lck != NULL /* exclusive mode */)) { + const unsigned snap_nreaders = + atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); + for (unsigned i = 0; i < snap_nreaders; ++i) { + retry: + if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease)) { + /* mdbx_jitter4testing(true); */ + const pgno_t snap_pages = atomic_load32( + &lck->mti_readers[i].mr_snapshot_pages_used, mo_Relaxed); + const txnid_t snap_txnid = safe64_read(&lck->mti_readers[i].mr_txnid); + if (unlikely( + snap_pages != + atomic_load32(&lck->mti_readers[i].mr_snapshot_pages_used, + mo_AcquireRelease) || + snap_txnid != safe64_read(&lck->mti_readers[i].mr_txnid))) + goto retry; + if (largest < snap_pages && + atomic_load64(&lck->mti_oldest_reader, mo_AcquireRelease) <= + /* ignore pending updates */ snap_txnid && + snap_txnid <= env->me_txn0->mt_txnid) + largest = snap_pages; + } + } + } + + return largest; +} + +/* Add a page to the txn's dirty list */ +static int __must_check_result mdbx_page_dirty(MDBX_txn *txn, MDBX_page *mp, + unsigned npages) { +#if xMDBX_DEBUG_SPILLING == 2 + txn->mt_env->debug_dirtied_act += 1; + mdbx_ensure(txn->mt_env, + txn->mt_env->debug_dirtied_act < txn->mt_env->debug_dirtied_est); + mdbx_ensure(txn->mt_env, txn->tw.dirtyroom + txn->tw.loose_count > 0); +#endif /* xMDBX_DEBUG_SPILLING == 2 */ + + int rc; + mp->mp_txnid = txn->mt_front; + if (unlikely(txn->tw.dirtyroom == 0)) { + if (txn->tw.loose_count) { + MDBX_page *loose = txn->tw.loose_pages; + mdbx_debug("purge-and-reclaim loose page %" PRIaPGNO, loose->mp_pgno); + rc = mdbx_pnl_insert_range(&txn->tw.reclaimed_pglist, loose->mp_pgno, 1); + if (unlikely(rc != 
MDBX_SUCCESS)) + goto bailout; + unsigned di = mdbx_dpl_search(txn, loose->mp_pgno); + mdbx_tassert(txn, txn->tw.dirtylist->items[di].ptr == loose); + mdbx_dpl_remove(txn, di); + txn->tw.loose_pages = loose->mp_next; + txn->tw.loose_count--; + txn->tw.dirtyroom++; + if (!(txn->mt_flags & MDBX_WRITEMAP)) + mdbx_dpage_free(txn->mt_env, loose, 1); + } else { + mdbx_error("Dirtyroom is depleted, DPL length %u", + txn->tw.dirtylist->length); + if (!(txn->mt_flags & MDBX_WRITEMAP)) + mdbx_dpage_free(txn->mt_env, mp, npages); + return MDBX_TXN_FULL; + } + } + + rc = mdbx_dpl_append(txn, mp->mp_pgno, mp, npages); + if (unlikely(rc != MDBX_SUCCESS)) { + bailout: + txn->mt_flags |= MDBX_TXN_ERROR; + return rc; + } + txn->tw.dirtyroom--; + mdbx_tassert(txn, mdbx_dirtylist_check(txn)); + return MDBX_SUCCESS; +} + +#if !(defined(_WIN32) || defined(_WIN64)) +MDBX_MAYBE_UNUSED static __always_inline int ignore_enosys(int err) { +#ifdef ENOSYS + if (err == ENOSYS) + return MDBX_RESULT_TRUE; +#endif /* ENOSYS */ +#ifdef ENOIMPL + if (err == ENOIMPL) + return MDBX_RESULT_TRUE; +#endif /* ENOIMPL */ +#ifdef ENOTSUP + if (err == ENOTSUP) + return MDBX_RESULT_TRUE; +#endif /* ENOTSUP */ +#ifdef ENOSUPP + if (err == ENOSUPP) + return MDBX_RESULT_TRUE; +#endif /* ENOSUPP */ +#ifdef EOPNOTSUPP + if (err == EOPNOTSUPP) + return MDBX_RESULT_TRUE; +#endif /* EOPNOTSUPP */ + if (err == EAGAIN) + return MDBX_RESULT_TRUE; + return err; +} +#endif /* defined(_WIN32) || defined(_WIN64) */ + +#if MDBX_ENABLE_MADVISE +/* Turn on/off readahead. It's harmful when the DB is larger than RAM. */ +static __cold int mdbx_set_readahead(MDBX_env *env, const pgno_t edge, + const bool enable, + const bool force_whole) { + mdbx_assert(env, edge >= NUM_METAS && edge <= MAX_PAGENO); + mdbx_assert(env, (enable & 1) == (enable != 0)); + const bool toggle = force_whole || + ((enable ^ env->me_lck->mti_readahead_anchor) & 1) || + !env->me_lck->mti_readahead_anchor; + const pgno_t prev_edge = env->me_lck->mti_readahead_anchor >> 1; + const size_t limit = env->me_dxb_mmap.limit; + size_t offset = + toggle ? 0 + : pgno_align2os_bytes(env, (prev_edge < edge) ? prev_edge : edge); + offset = (offset < limit) ? offset : limit; + + size_t length = + pgno_align2os_bytes(env, (prev_edge < edge) ? edge : prev_edge); + length = (length < limit) ? length : limit; + length -= offset; + + mdbx_assert(env, 0 <= (intptr_t)length); + if (length == 0) + return MDBX_SUCCESS; + + mdbx_notice("readahead %s %u..%u", enable ? "ON" : "OFF", + bytes2pgno(env, offset), bytes2pgno(env, offset + length)); + +#if defined(F_RDAHEAD) + if (toggle && unlikely(fcntl(env->me_lazy_fd, F_RDAHEAD, enable) == -1)) + return errno; +#endif /* F_RDAHEAD */ + + int err; + if (enable) { +#if defined(MADV_NORMAL) + err = madvise(env->me_map + offset, length, MADV_NORMAL) + ? 
ignore_enosys(errno) + : MDBX_SUCCESS; + if (unlikely(MDBX_IS_ERROR(err))) + return err; +#elif defined(POSIX_MADV_NORMAL) + err = ignore_enosys( + posix_madvise(env->me_map + offset, length, POSIX_MADV_NORMAL)); + if (unlikely(MDBX_IS_ERROR(err))) + return err; +#elif defined(POSIX_FADV_NORMAL) && defined(POSIX_FADV_WILLNEED) + err = ignore_enosys( + posix_fadvise(env->me_lazy_fd, offset, length, POSIX_FADV_NORMAL)); + if (unlikely(MDBX_IS_ERROR(err))) + return err; +#elif defined(_WIN32) || defined(_WIN64) + /* no madvise on Windows */ +#else +#warning "FIXME" +#endif + if (toggle) { + /* NOTE: Seems there is a bug in the Mach/Darwin/OSX kernel, + * because MADV_WILLNEED with offset != 0 may cause SIGBUS + * on following access to the hinted region. + * 19.6.0 Darwin Kernel Version 19.6.0: Tue Jan 12 22:13:05 PST 2021; + * root:xnu-6153.141.16~1/RELEASE_X86_64 x86_64 */ +#if defined(F_RDADVISE) + struct radvisory hint; + hint.ra_offset = offset; + hint.ra_count = length; + (void)/* Ignore ENOTTY for DB on the ram-disk and so on */ fcntl( + env->me_lazy_fd, F_RDADVISE, &hint); +#elif defined(MADV_WILLNEED) + err = madvise(env->me_map + offset, length, MADV_WILLNEED) + ? ignore_enosys(errno) + : MDBX_SUCCESS; + if (unlikely(MDBX_IS_ERROR(err))) + return err; +#elif defined(POSIX_MADV_WILLNEED) + err = ignore_enosys( + posix_madvise(env->me_map + offset, length, POSIX_MADV_WILLNEED)); + if (unlikely(MDBX_IS_ERROR(err))) + return err; +#elif defined(_WIN32) || defined(_WIN64) + if (mdbx_PrefetchVirtualMemory) { + WIN32_MEMORY_RANGE_ENTRY hint; + hint.VirtualAddress = env->me_map + offset; + hint.NumberOfBytes = length; + (void)mdbx_PrefetchVirtualMemory(GetCurrentProcess(), 1, &hint, 0); + } +#elif defined(POSIX_FADV_WILLNEED) + err = ignore_enosys( + posix_fadvise(env->me_lazy_fd, offset, length, POSIX_FADV_WILLNEED)); + if (unlikely(MDBX_IS_ERROR(err))) + return err; +#else +#warning "FIXME" +#endif + } + } else { +#if defined(MADV_RANDOM) + err = madvise(env->me_map + offset, length, MADV_RANDOM) + ? 
ignore_enosys(errno) + : MDBX_SUCCESS; + if (unlikely(MDBX_IS_ERROR(err))) + return err; +#elif defined(POSIX_MADV_RANDOM) + err = ignore_enosys( + posix_madvise(env->me_map + offset, length, POSIX_MADV_RANDOM)); + if (unlikely(MDBX_IS_ERROR(err))) + return err; +#elif defined(POSIX_FADV_RANDOM) + err = ignore_enosys( + posix_fadvise(env->me_lazy_fd, offset, length, POSIX_FADV_RANDOM)); + if (unlikely(MDBX_IS_ERROR(err))) + return err; +#elif defined(_WIN32) || defined(_WIN64) + /* no madvise on Windows */ +#else +#warning "FIXME" +#endif /* MADV_RANDOM */ + } + + env->me_lck->mti_readahead_anchor = (enable & 1) + (edge << 1); + err = MDBX_SUCCESS; + return err; +} +#endif /* MDBX_ENABLE_MADVISE */ + +static __cold int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno, + const pgno_t size_pgno, + const pgno_t limit_pgno, const bool implicit) { + const size_t limit_bytes = pgno_align2os_bytes(env, limit_pgno); + const size_t size_bytes = pgno_align2os_bytes(env, size_pgno); + const size_t prev_size = env->me_dxb_mmap.current; + const size_t prev_limit = env->me_dxb_mmap.limit; +#if MDBX_ENABLE_MADVISE || defined(MDBX_USE_VALGRIND) + const void *const prev_addr = env->me_map; +#endif /* MDBX_ENABLE_MADVISE || MDBX_USE_VALGRIND */ + + mdbx_verbose("resize datafile/mapping: " + "present %" PRIuPTR " -> %" PRIuPTR ", " + "limit %" PRIuPTR " -> %" PRIuPTR, + prev_size, size_bytes, prev_limit, limit_bytes); + + mdbx_assert(env, limit_bytes >= size_bytes); + mdbx_assert(env, bytes2pgno(env, size_bytes) >= size_pgno); + mdbx_assert(env, bytes2pgno(env, limit_bytes) >= limit_pgno); + + unsigned mresize_flags = + env->me_flags & (MDBX_RDONLY | MDBX_WRITEMAP | MDBX_UTTERLY_NOSYNC); +#if defined(_WIN32) || defined(_WIN64) + /* Acquire guard in exclusive mode for: + * - to avoid collision between read and write txns around env->me_dbgeo; + * - to avoid attachment of new reading threads (see mdbx_rdt_lock); */ + mdbx_srwlock_AcquireExclusive(&env->me_remap_guard); + mdbx_handle_array_t *suspended = NULL; + mdbx_handle_array_t array_onstack; + int rc = MDBX_SUCCESS; + if (limit_bytes == env->me_dxb_mmap.limit && + size_bytes == env->me_dxb_mmap.current && + size_bytes == env->me_dxb_mmap.filesize) + goto bailout; + + if ((env->me_flags & MDBX_NOTLS) == 0) { + /* 1) Windows allows only extending a read-write section, but not a + * corresponding mapped view. Therefore in other cases we must suspend + * the local threads for safe remap. + * 2) At least on Windows 10 1803 the entire mapped section is unavailable + * for short time during NtExtendSection() or VirtualAlloc() execution. + * 3) Under Wine runtime environment on Linux a section extending is not + * supported. + * + * THEREFORE LOCAL THREADS SUSPENDING IS ALWAYS REQUIRED! */ + array_onstack.limit = ARRAY_LENGTH(array_onstack.handles); + array_onstack.count = 0; + suspended = &array_onstack; + rc = mdbx_suspend_threads_before_remap(env, &suspended); + if (rc != MDBX_SUCCESS) { + mdbx_error("failed suspend-for-remap: errcode %d", rc); + goto bailout; + } + mresize_flags |= implicit ? 
MDBX_MRESIZE_MAY_UNMAP + : MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE; + } +#else /* Windows */ + /* Acquire guard to avoid collision between read and write txns + * around env->me_dbgeo */ + int rc = mdbx_fastmutex_acquire(&env->me_remap_guard); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + if (limit_bytes == env->me_dxb_mmap.limit && + size_bytes == env->me_dxb_mmap.current) + goto bailout; + + MDBX_lockinfo *const lck = env->me_lck_mmap.lck; + if (limit_bytes != env->me_dxb_mmap.limit && !(env->me_flags & MDBX_NOTLS) && + lck && !implicit) { + int err = mdbx_rdt_lock(env) /* lock readers table until remap done */; + if (unlikely(MDBX_IS_ERROR(err))) { + rc = err; + goto bailout; + } + + /* looking for readers from this process */ + const unsigned snap_nreaders = + atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); + mresize_flags |= implicit ? MDBX_MRESIZE_MAY_UNMAP + : MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE; + for (unsigned i = 0; i < snap_nreaders; ++i) { + if (lck->mti_readers[i].mr_pid.weak == env->me_pid && + lck->mti_readers[i].mr_tid.weak != mdbx_thread_self()) { + /* the base address of the mapping can't be changed since + * the other reader thread from this process exists. */ + mdbx_rdt_unlock(env); + mresize_flags &= ~(MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE); + break; + } + } + } +#endif /* ! Windows */ + + if ((env->me_flags & MDBX_WRITEMAP) && env->me_lck->mti_unsynced_pages.weak) { +#if MDBX_ENABLE_PGOP_STAT + safe64_inc(&env->me_lck->mti_pgop_stat.wops, 1); +#endif /* MDBX_ENABLE_PGOP_STAT */ + rc = mdbx_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, used_pgno), + MDBX_SYNC_NONE); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } + +#if MDBX_ENABLE_MADVISE + if (size_bytes < prev_size) { + mdbx_notice("resize-MADV_%s %u..%u", + (env->me_flags & MDBX_WRITEMAP) ? "REMOVE" : "DONTNEED", + size_pgno, bytes2pgno(env, prev_size)); + rc = MDBX_RESULT_TRUE; +#if defined(MADV_REMOVE) + if (env->me_flags & MDBX_WRITEMAP) + rc = + madvise(env->me_map + size_bytes, prev_size - size_bytes, MADV_REMOVE) + ? ignore_enosys(errno) + : MDBX_SUCCESS; +#endif /* MADV_REMOVE */ +#if defined(MADV_DONTNEED) + if (rc == MDBX_RESULT_TRUE) + rc = madvise(env->me_map + size_bytes, prev_size - size_bytes, + MADV_DONTNEED) + ? 
ignore_enosys(errno)
+                  : MDBX_SUCCESS;
+#elif defined(POSIX_MADV_DONTNEED)
+    if (rc == MDBX_RESULT_TRUE)
+      rc = ignore_enosys(posix_madvise(env->me_map + size_bytes,
+                                       prev_size - size_bytes,
+                                       POSIX_MADV_DONTNEED));
+#elif defined(POSIX_FADV_DONTNEED)
+    if (rc == MDBX_RESULT_TRUE)
+      rc = ignore_enosys(posix_fadvise(env->me_lazy_fd, size_bytes,
+                                       prev_size - size_bytes,
+                                       POSIX_FADV_DONTNEED));
+#endif /* MADV_DONTNEED */
+    if (unlikely(MDBX_IS_ERROR(rc)))
+      goto bailout;
+    if (env->me_lck->mti_discarded_tail.weak > size_pgno)
+      env->me_lck->mti_discarded_tail.weak = size_pgno;
+  }
+#endif /* MDBX_ENABLE_MADVISE */
+
+  rc = mdbx_mresize(mresize_flags, &env->me_dxb_mmap, size_bytes, limit_bytes);
+
+#if MDBX_ENABLE_MADVISE
+  if (rc == MDBX_SUCCESS) {
+    env->me_lck->mti_discarded_tail.weak = size_pgno;
+    const bool readahead =
+        !(env->me_flags & MDBX_NORDAHEAD) &&
+        mdbx_is_readahead_reasonable(size_bytes, -(intptr_t)prev_size);
+    const bool force = limit_bytes != prev_limit ||
+                       env->me_dxb_mmap.address != prev_addr
+#if defined(_WIN32) || defined(_WIN64)
+                       || prev_size > size_bytes
+#endif /* Windows */
+        ;
+    rc = mdbx_set_readahead(env, size_pgno, readahead, force);
+  }
+#endif /* MDBX_ENABLE_MADVISE */
+
+bailout:
+  if (rc == MDBX_SUCCESS) {
+#if defined(_WIN32) || defined(_WIN64)
+    mdbx_assert(env, size_bytes == env->me_dxb_mmap.current);
+    mdbx_assert(env, size_bytes <= env->me_dxb_mmap.filesize);
+    mdbx_assert(env, limit_bytes == env->me_dxb_mmap.limit);
+#endif /* Windows */
+#ifdef MDBX_USE_VALGRIND
+    if (prev_limit != env->me_dxb_mmap.limit || prev_addr != env->me_map) {
+      VALGRIND_DISCARD(env->me_valgrind_handle);
+      env->me_valgrind_handle = 0;
+      if (env->me_dxb_mmap.limit)
+        env->me_valgrind_handle =
+            VALGRIND_CREATE_BLOCK(env->me_map, env->me_dxb_mmap.limit, "mdbx");
+    }
+#endif /* MDBX_USE_VALGRIND */
+  } else {
+    if (rc != MDBX_UNABLE_EXTEND_MAPSIZE && rc != MDBX_RESULT_TRUE) {
+      mdbx_error("failed resize datafile/mapping: "
+                 "present %" PRIuPTR " -> %" PRIuPTR ", "
+                 "limit %" PRIuPTR " -> %" PRIuPTR ", errcode %d",
+                 prev_size, size_bytes, prev_limit, limit_bytes, rc);
+    } else {
+      mdbx_warning("unable resize datafile/mapping: "
+                   "present %" PRIuPTR " -> %" PRIuPTR ", "
+                   "limit %" PRIuPTR " -> %" PRIuPTR ", errcode %d",
+                   prev_size, size_bytes, prev_limit, limit_bytes, rc);
+    }
+    if (!env->me_dxb_mmap.address) {
+      env->me_flags |= MDBX_FATAL_ERROR;
+      if (env->me_txn)
+        env->me_txn->mt_flags |= MDBX_TXN_ERROR;
+      rc = MDBX_PANIC;
+    }
+  }
+
+#if defined(_WIN32) || defined(_WIN64)
+  int err = MDBX_SUCCESS;
+  mdbx_srwlock_ReleaseExclusive(&env->me_remap_guard);
+  if (suspended) {
+    err = mdbx_resume_threads_after_remap(suspended);
+    if (suspended != &array_onstack)
+      mdbx_free(suspended);
+  }
+#else
+  if (env->me_lck_mmap.lck &&
+      (mresize_flags & (MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE)) != 0)
+    mdbx_rdt_unlock(env);
+  int err = mdbx_fastmutex_release(&env->me_remap_guard);
+#endif /* Windows */
+  if (err != MDBX_SUCCESS) {
+    mdbx_fatal("failed resume-after-remap: errcode %d", err);
+    return MDBX_PANIC;
+  }
+  return rc;
+}
+
+static __cold int mdbx_mapresize_implicit(MDBX_env *env, const pgno_t used_pgno,
+                                          const pgno_t size_pgno,
+                                          const pgno_t limit_pgno) {
+  const pgno_t mapped_pgno = bytes2pgno(env, env->me_dxb_mmap.limit);
+  mdbx_assert(env, mapped_pgno >= used_pgno);
+  return mdbx_mapresize(
+      env, used_pgno, size_pgno,
+      (size_pgno > mapped_pgno)
+          ? limit_pgno
+          : /* The actual mapsize may be less, since the geo.upper may be
+               changed by another process. So, avoid remapping until it
+               becomes necessary. */
+          mapped_pgno,
+      true);
+}
+
+static int mdbx_meta_unsteady(MDBX_env *env, const txnid_t last_steady,
+                              MDBX_meta *const meta, mdbx_filehandle_t fd) {
+  const uint64_t wipe = MDBX_DATASIGN_NONE;
+  if (unlikely(META_IS_STEADY(meta)) &&
+      mdbx_meta_txnid_stable(env, meta) <= last_steady) {
+    mdbx_warning("wipe txn #%" PRIaTXN ", meta %" PRIaPGNO, last_steady,
+                 data_page(meta)->mp_pgno);
+    if (env->me_flags & MDBX_WRITEMAP)
+      unaligned_poke_u64(4, meta->mm_datasync_sign, wipe);
+    else
+      return mdbx_pwrite(fd, &wipe, sizeof(meta->mm_datasync_sign),
+                         (uint8_t *)&meta->mm_datasync_sign - env->me_map);
+  }
+  return MDBX_SUCCESS;
+}
+
+__cold static int mdbx_wipe_steady(MDBX_env *env, const txnid_t last_steady) {
+#if MDBX_ENABLE_PGOP_STAT
+  safe64_inc(&env->me_lck->mti_pgop_stat.wops, 1);
+#endif /* MDBX_ENABLE_PGOP_STAT */
+  const mdbx_filehandle_t fd = (env->me_dsync_fd != INVALID_HANDLE_VALUE)
+                                   ? env->me_dsync_fd
+                                   : env->me_lazy_fd;
+  int err = mdbx_meta_unsteady(env, last_steady, METAPAGE(env, 0), fd);
+  if (unlikely(err != MDBX_SUCCESS))
+    return err;
+  err = mdbx_meta_unsteady(env, last_steady, METAPAGE(env, 1), fd);
+  if (unlikely(err != MDBX_SUCCESS))
+    return err;
+  err = mdbx_meta_unsteady(env, last_steady, METAPAGE(env, 2), fd);
+  if (unlikely(err != MDBX_SUCCESS))
+    return err;
+
+  if (env->me_flags & MDBX_WRITEMAP) {
+    mdbx_flush_incoherent_cpu_writeback();
+    err = mdbx_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS),
+                     MDBX_SYNC_DATA);
+    if (unlikely(err != MDBX_SUCCESS))
+      return err;
+  } else {
+    if (fd == env->me_lazy_fd) {
+#if MDBX_USE_SYNCFILERANGE
+      static bool syncfilerange_unavailable;
+      if (!syncfilerange_unavailable &&
+          sync_file_range(env->me_lazy_fd, 0, pgno2bytes(env, NUM_METAS),
+                          SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER)) {
+        err = errno;
+        if (ignore_enosys(err) == MDBX_RESULT_TRUE)
+          syncfilerange_unavailable = true;
+      }
+      if (syncfilerange_unavailable)
+#endif /* MDBX_USE_SYNCFILERANGE */
+        err = mdbx_fsync(env->me_lazy_fd, MDBX_SYNC_DATA);
+      if (unlikely(err != MDBX_SUCCESS))
+        return err;
+    }
+    mdbx_flush_incoherent_mmap(env->me_map, pgno2bytes(env, NUM_METAS),
+                               env->me_os_psize);
+  }
+
+  /* force oldest refresh */
+  atomic_store32(&env->me_lck->mti_readers_refresh_flag, true, mo_Relaxed);
+  return MDBX_SUCCESS;
+}
+
+/* Allocate page numbers and memory for writing. Maintain mt_last_reclaimed,
+ * mt_reclaimed_pglist and mt_next_pgno. Set MDBX_TXN_ERROR on failure.
+ *
+ * If there are free pages available from older transactions, they
+ * are re-used first. Otherwise allocate a new page at mt_next_pgno.
+ * Do not modify the GC, just merge GC records into mt_reclaimed_pglist
+ * and move mt_last_reclaimed to say which records were consumed. Only this
+ * function can create mt_reclaimed_pglist and move
+ * mt_last_reclaimed/mt_next_pgno.
+ *
+ * [in] mc    A cursor handle identifying the transaction and
+ *            database for which we are allocating.
+ * [in] num   the number of pages to allocate.
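+ * [in] flags a bitmask of the MDBX_ALLOC_* bits defined below, restricting
+ *            where pages may be taken from (the loose-page cache, the GC,
+ *            or a fresh allocation at mt_next_pgno).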
+ * + * Returns 0 on success, non-zero on failure.*/ + +#define MDBX_ALLOC_CACHE 1 +#define MDBX_ALLOC_GC 2 +#define MDBX_ALLOC_NEW 4 +#define MDBX_ALLOC_SLOT 8 +#define MDBX_ALLOC_ALL (MDBX_ALLOC_CACHE | MDBX_ALLOC_GC | MDBX_ALLOC_NEW) + +__hot static struct page_result mdbx_page_alloc(MDBX_cursor *mc, + const unsigned num, int flags) { + struct page_result ret; + MDBX_txn *const txn = mc->mc_txn; + MDBX_env *const env = txn->mt_env; + + const unsigned coalesce_threshold = + env->me_maxgc_ov1page - env->me_maxgc_ov1page / 4; + if (likely(flags & MDBX_ALLOC_GC)) { + flags |= env->me_flags & (MDBX_COALESCE | MDBX_LIFORECLAIM); + if (MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) > coalesce_threshold) + flags &= ~MDBX_COALESCE; + if (unlikely( + /* If mc is updating the GC, then the retired-list cannot play + catch-up with itself by growing while trying to save it. */ + (mc->mc_flags & C_RECLAIMING) || + /* avoid (recursive) search inside empty tree and while tree is + updating, https://github.com/erthink/libmdbx/issues/31 */ + txn->mt_dbs[FREE_DBI].md_entries == 0 || + /* If our dirty list is already full, we can't touch GC */ + (txn->tw.dirtyroom < txn->mt_dbs[FREE_DBI].md_depth && + !(txn->mt_dbistate[FREE_DBI] & DBI_DIRTY)))) + flags &= ~(MDBX_ALLOC_GC | MDBX_COALESCE); + } + + if (likely(num == 1 && (flags & MDBX_ALLOC_CACHE) != 0)) { + /* If there are any loose pages, just use them */ + mdbx_assert(env, (flags & MDBX_ALLOC_SLOT) == 0); + if (likely(txn->tw.loose_pages)) { +#if MDBX_ENABLE_REFUND + if (txn->tw.loose_refund_wl > txn->mt_next_pgno) { + mdbx_refund(txn); + if (unlikely(!txn->tw.loose_pages)) + goto no_loose; + } +#endif /* MDBX_ENABLE_REFUND */ + + ret.page = txn->tw.loose_pages; + txn->tw.loose_pages = ret.page->mp_next; + txn->tw.loose_count--; + mdbx_debug("db %d use loose page %" PRIaPGNO, DDBI(mc), + ret.page->mp_pgno); + mdbx_tassert(txn, ret.page->mp_pgno < txn->mt_next_pgno); + mdbx_ensure(env, ret.page->mp_pgno >= NUM_METAS); + VALGRIND_MAKE_MEM_UNDEFINED(page_data(ret.page), page_space(txn->mt_env)); + ASAN_UNPOISON_MEMORY_REGION(page_data(ret.page), page_space(txn->mt_env)); + ret.page->mp_txnid = txn->mt_front; + ret.err = MDBX_SUCCESS; + return ret; + } + } +#if MDBX_ENABLE_REFUND +no_loose: +#endif /* MDBX_ENABLE_REFUND */ + + mdbx_tassert(txn, + mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + pgno_t pgno, *re_list = txn->tw.reclaimed_pglist; + unsigned range_begin = 0, re_len = MDBX_PNL_SIZE(re_list); + txnid_t oldest = 0, last = 0; + const unsigned wanna_range = num - 1; + + while (true) { /* hsr-kick retry loop */ + MDBX_cursor_couple recur; + for (MDBX_cursor_op op = MDBX_FIRST;; + op = (flags & MDBX_LIFORECLAIM) ? MDBX_PREV : MDBX_NEXT) { + MDBX_val key, data; + + /* Seek a big enough contiguous page range. + * Prefer pages with lower pgno. */ + mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, + txn->mt_next_pgno)); + if ((flags & (MDBX_COALESCE | MDBX_ALLOC_CACHE)) == MDBX_ALLOC_CACHE && + re_len > wanna_range) { + mdbx_tassert(txn, MDBX_PNL_LAST(re_list) < txn->mt_next_pgno && + MDBX_PNL_FIRST(re_list) < txn->mt_next_pgno); + range_begin = MDBX_PNL_ASCENDING ? 
1 : re_len; + pgno = MDBX_PNL_LEAST(re_list); + if (likely(wanna_range == 0)) + goto done; +#if MDBX_PNL_ASCENDING + mdbx_tassert(txn, pgno == re_list[1] && range_begin == 1); + while (true) { + unsigned range_end = range_begin + wanna_range; + if (re_list[range_end] - pgno == wanna_range) + goto done; + if (range_end == re_len) + break; + pgno = re_list[++range_begin]; + } +#else + mdbx_tassert(txn, pgno == re_list[re_len] && range_begin == re_len); + while (true) { + if (re_list[range_begin - wanna_range] - pgno == wanna_range) + goto done; + if (range_begin == wanna_range) + break; + pgno = re_list[--range_begin]; + } +#endif /* MDBX_PNL sort-order */ + } + + if (op == MDBX_FIRST) { /* 1st iteration, setup cursor, etc */ + if (unlikely(!(flags & MDBX_ALLOC_GC))) + break /* reclaiming is prohibited for now */; + + /* Prepare to fetch more and coalesce */ + oldest = (flags & MDBX_LIFORECLAIM) + ? mdbx_find_oldest(txn) + : atomic_load64(&env->me_lck->mti_oldest_reader, + mo_AcquireRelease); + ret.err = mdbx_cursor_init(&recur.outer, txn, FREE_DBI); + if (unlikely(ret.err != MDBX_SUCCESS)) + goto fail; + if (flags & MDBX_LIFORECLAIM) { + /* Begin from oldest reader if any */ + if (oldest > MIN_TXNID) { + last = oldest - 1; + op = MDBX_SET_RANGE; + } + } else if (txn->tw.last_reclaimed) { + /* Continue lookup from txn->tw.last_reclaimed to oldest reader */ + last = txn->tw.last_reclaimed; + op = MDBX_SET_RANGE; + } + + key.iov_base = &last; + key.iov_len = sizeof(last); + } + + if (!(flags & MDBX_LIFORECLAIM)) { + /* Do not try fetch more if the record will be too recent */ + if (op != MDBX_FIRST && ++last >= oldest) { + oldest = mdbx_find_oldest(txn); + if (oldest <= last) + break; + } + } + + ret.err = mdbx_cursor_get(&recur.outer, &key, NULL, op); + if (ret.err == MDBX_NOTFOUND && (flags & MDBX_LIFORECLAIM)) { + if (op == MDBX_SET_RANGE) + continue; + txnid_t snap = mdbx_find_oldest(txn); + if (oldest < snap) { + oldest = snap; + last = oldest - 1; + key.iov_base = &last; + key.iov_len = sizeof(last); + op = MDBX_SET_RANGE; + ret.err = mdbx_cursor_get(&recur.outer, &key, NULL, op); + } + } + if (unlikely(ret.err)) { + if (ret.err == MDBX_NOTFOUND) + break; + goto fail; + } + + if (!MDBX_DISABLE_PAGECHECKS && + unlikely(key.iov_len != sizeof(txnid_t))) { + ret.err = MDBX_CORRUPTED; + goto fail; + } + last = unaligned_peek_u64(4, key.iov_base); + if (!MDBX_DISABLE_PAGECHECKS && + unlikely(last < MIN_TXNID || last > MAX_TXNID)) { + ret.err = MDBX_CORRUPTED; + goto fail; + } + if (oldest <= last) { + oldest = mdbx_find_oldest(txn); + if (oldest <= last) { + if (flags & MDBX_LIFORECLAIM) + continue; + break; + } + } + + if (flags & MDBX_LIFORECLAIM) { + /* skip IDs of records that already reclaimed */ + if (txn->tw.lifo_reclaimed) { + size_t i; + for (i = (size_t)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed); i > 0; --i) + if (txn->tw.lifo_reclaimed[i] == last) + break; + if (i) + continue; + } + } + + /* Reading next GC record */ + MDBX_page *const mp = recur.outer.mc_pg[recur.outer.mc_top]; + if (unlikely((ret.err = mdbx_node_read( + &recur.outer, + page_node(mp, recur.outer.mc_ki[recur.outer.mc_top]), + &data, pp_txnid4chk(mp, txn))) != MDBX_SUCCESS)) + goto fail; + + if ((flags & MDBX_LIFORECLAIM) && !txn->tw.lifo_reclaimed) { + txn->tw.lifo_reclaimed = mdbx_txl_alloc(); + if (unlikely(!txn->tw.lifo_reclaimed)) { + ret.err = MDBX_ENOMEM; + goto fail; + } + } + + /* Append PNL from GC record to tw.reclaimed_pglist */ + mdbx_cassert(mc, (mc->mc_flags & C_GCFREEZE) == 0); + pgno_t *gc_pnl = (pgno_t 
*)data.iov_base;
+      mdbx_tassert(txn, data.iov_len >= MDBX_PNL_SIZEOF(gc_pnl));
+      if (unlikely(data.iov_len < MDBX_PNL_SIZEOF(gc_pnl) ||
+                   !mdbx_pnl_check(gc_pnl, txn->mt_next_pgno))) {
+        ret.err = MDBX_CORRUPTED;
+        goto fail;
+      }
+      const unsigned gc_len = MDBX_PNL_SIZE(gc_pnl);
+      if (unlikely(/* resulting list is too long */ gc_len +
+                       MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) >
+                   env->me_options.rp_augment_limit) &&
+          (((/* not a slot-request from gc-update */
+             (flags & MDBX_ALLOC_SLOT) == 0 ||
+             (flags & MDBX_LIFORECLAIM) == 0 ||
+             (txn->tw.lifo_reclaimed &&
+              MDBX_PNL_SIZE(txn->tw.lifo_reclaimed))) &&
+            /* have enough unallocated space */ pgno_add(
+                txn->mt_next_pgno, num) <= txn->mt_geo.upper) ||
+           gc_len + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) >=
+               MDBX_PGL_LIMIT / 16 * 15)) {
+        /* Stop reclaiming to avoid overflowing the page list.
+         * This is a rare case that occurs while searching for a contiguous
+         * multi-page region in a large database.
+         * https://github.com/erthink/libmdbx/issues/123 */
+        mdbx_debug("stop reclaiming to avoid PNL overflow: %u (current) + %u "
+                   "(chunk) -> %u",
+                   MDBX_PNL_SIZE(txn->tw.reclaimed_pglist), gc_len,
+                   gc_len + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist));
+        flags &= ~(MDBX_ALLOC_GC | MDBX_COALESCE);
+        break;
+      }
+      ret.err = mdbx_pnl_need(&txn->tw.reclaimed_pglist, gc_len);
+      if (unlikely(ret.err != MDBX_SUCCESS))
+        goto fail;
+      re_list = txn->tw.reclaimed_pglist;
+
+      /* Remember ID of GC record */
+      if (flags & MDBX_LIFORECLAIM) {
+        ret.err = mdbx_txl_append(&txn->tw.lifo_reclaimed, last);
+        if (unlikely(ret.err != MDBX_SUCCESS))
+          goto fail;
+      }
+      txn->tw.last_reclaimed = last;
+
+      if (mdbx_log_enabled(MDBX_LOG_EXTRA)) {
+        mdbx_debug_extra("PNL read txn %" PRIaTXN " root %" PRIaPGNO
+                         " num %u, PNL",
+                         last, txn->mt_dbs[FREE_DBI].md_root, gc_len);
+        for (unsigned i = gc_len; i; i--)
+          mdbx_debug_extra_print(" %" PRIaPGNO, gc_pnl[i]);
+        mdbx_debug_extra_print("%s\n", ".");
+      }
+
+      /* Merge in descending sorted order */
+      const unsigned prev_re_len = MDBX_PNL_SIZE(re_list);
+      mdbx_pnl_xmerge(re_list, gc_pnl);
+      /* re-check to avoid duplicates */
+      if (!MDBX_DISABLE_PAGECHECKS &&
+          unlikely(!mdbx_pnl_check(re_list, txn->mt_next_pgno))) {
+        ret.err = MDBX_CORRUPTED;
+        goto fail;
+      }
+      mdbx_tassert(txn, mdbx_dirtylist_check(txn));
+
+      re_len = MDBX_PNL_SIZE(re_list);
+      mdbx_tassert(txn, re_len == 0 || re_list[re_len] < txn->mt_next_pgno);
+      if (MDBX_ENABLE_REFUND && re_len &&
+          unlikely(MDBX_PNL_MOST(re_list) == txn->mt_next_pgno - 1)) {
+        /* Refund suitable pages into "unallocated" space */
+        mdbx_refund(txn);
+        re_list = txn->tw.reclaimed_pglist;
+        re_len = MDBX_PNL_SIZE(re_list);
+      }
+
+      /* Done for the kick-reclaim mode: no page is actually needed */
+      if (unlikely(flags & MDBX_ALLOC_SLOT)) {
+        ret.err = MDBX_SUCCESS;
+        ret.page = NULL;
+        return ret;
+      }
+
+      /* Don't try to coalesce too much. */
+      if (re_len /* current size */ > coalesce_threshold ||
+          (re_len > prev_re_len && re_len - prev_re_len /* delta from prev */ >=
+                                       coalesce_threshold / 2))
+        flags &= ~MDBX_COALESCE;
+    }
+
+    if (F_ISSET(flags, MDBX_COALESCE | MDBX_ALLOC_CACHE)) {
+      flags -= MDBX_COALESCE;
+      continue;
+    }
+
+    /* There are no suitable pages in the GC, so to be able to allocate
+     * we should choose one of:
+     *  - make a new steady checkpoint if reclaiming was stopped by
+     *    the last steady-sync, or wipe it in the MDBX_UTTERLY_NOSYNC mode;
+     *  - kick lagging reader(s) if reclaiming was stopped by one of them;
+     *  - extend the database file.
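+     * The code below tries these remedies and re-enters the retry loop
+     * whenever one of them may have released usable pages.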
     */
+
+    /* Will use new pages from the map if nothing is suitable in the GC. */
+    range_begin = 0;
+    pgno = txn->mt_next_pgno;
+    const pgno_t next = pgno_add(pgno, num);
+
+    if (flags & MDBX_ALLOC_GC) {
+      const MDBX_meta *const head = mdbx_meta_head(env);
+      MDBX_meta *const steady = mdbx_meta_steady(env);
+      /* did reclaiming stop at the last steady point? */
+      if (head != steady && META_IS_STEADY(steady) &&
+          oldest == mdbx_meta_txnid_stable(env, steady)) {
+        mdbx_debug("gc-kick-steady: head %" PRIaTXN "-%s, tail %" PRIaTXN
+                   "-%s, oldest %" PRIaTXN,
+                   mdbx_meta_txnid_stable(env, head), mdbx_durable_str(head),
+                   mdbx_meta_txnid_stable(env, steady),
+                   mdbx_durable_str(steady), oldest);
+        ret.err = MDBX_RESULT_TRUE;
+        const pgno_t autosync_threshold =
+            atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed);
+        const uint64_t autosync_period =
+            atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed);
+        /* wipe the last steady-point if one of:
+         *  - UTTERLY_NOSYNC mode AND auto-sync threshold is NOT specified
+         *  - UTTERLY_NOSYNC mode AND free space at steady-point is exhausted
+         * otherwise, make a new steady-point if one of:
+         *  - auto-sync threshold is specified and reached;
+         *  - upper limit of database size is reached;
+         *  - database is full (with the current file size)
+         *    AND auto-sync threshold is NOT specified */
+        if (F_ISSET(env->me_flags, MDBX_UTTERLY_NOSYNC) &&
+            ((autosync_threshold | autosync_period) == 0 ||
+             next >= steady->mm_geo.now)) {
+          /* wipe steady checkpoint in MDBX_UTTERLY_NOSYNC mode
+           * without any auto-sync threshold(s). */
+          ret.err = mdbx_wipe_steady(env, oldest);
+          mdbx_debug("gc-wipe-steady, rc %d", ret.err);
+          mdbx_assert(env, steady != mdbx_meta_steady(env));
+        } else if ((flags & MDBX_ALLOC_NEW) == 0 ||
+                   (autosync_threshold &&
+                    atomic_load32(&env->me_lck->mti_unsynced_pages,
+                                  mo_Relaxed) >= autosync_threshold) ||
+                   (autosync_period &&
+                    mdbx_osal_monotime() -
+                            atomic_load64(&env->me_lck->mti_sync_timestamp,
+                                          mo_Relaxed) >=
+                        autosync_period) ||
+                   next >= txn->mt_geo.upper ||
+                   (next >= txn->mt_end_pgno &&
+                    (autosync_threshold | autosync_period) == 0)) {
+          /* make steady checkpoint. */
+          MDBX_meta meta = *head;
+          ret.err = mdbx_sync_locked(env, env->me_flags & MDBX_WRITEMAP, &meta);
+          mdbx_debug("gc-make-steady, rc %d", ret.err);
+          mdbx_assert(env, steady != mdbx_meta_steady(env));
+        }
+        if (ret.err == MDBX_SUCCESS) {
+          if (mdbx_find_oldest(txn) > oldest)
+            continue;
+          /* it is reasonable to check/kick lagging reader(s) here,
+           * since we have just made a new steady point or wiped the last. */
+          if (oldest < txn->mt_txnid - xMDBX_TXNID_STEP &&
+              mdbx_kick_longlived_readers(env, oldest) > oldest)
+            continue;
+        } else if (unlikely(ret.err != MDBX_RESULT_TRUE))
+          goto fail;
+      }
+    }
+
+    /* don't kick lagging reader(s) if there is enough unallocated space
+     * at the end of the database file.
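+     * Growing within the already-allocated file size is cheaper and less
+     * intrusive than interrupting somebody's read transaction.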
*/ + if ((flags & MDBX_ALLOC_NEW) && next <= txn->mt_end_pgno) + goto done; + if ((flags & MDBX_ALLOC_GC) && oldest < txn->mt_txnid - xMDBX_TXNID_STEP && + mdbx_kick_longlived_readers(env, oldest) > oldest) + continue; + + ret.err = MDBX_NOTFOUND; + if (flags & MDBX_ALLOC_NEW) { + ret.err = MDBX_MAP_FULL; + if (next <= txn->mt_geo.upper && txn->mt_geo.grow_pv) { + mdbx_assert(env, next > txn->mt_end_pgno); + const pgno_t grow_step = pv2pages(txn->mt_geo.grow_pv); + pgno_t aligned = pgno_align2os_pgno( + env, pgno_add(next, grow_step - next % grow_step)); + + if (aligned > txn->mt_geo.upper) + aligned = txn->mt_geo.upper; + mdbx_assert(env, aligned > txn->mt_end_pgno); + + mdbx_verbose("try growth datafile to %" PRIaPGNO " pages (+%" PRIaPGNO + ")", + aligned, aligned - txn->mt_end_pgno); + ret.err = mdbx_mapresize_implicit(env, txn->mt_next_pgno, aligned, + txn->mt_geo.upper); + if (ret.err == MDBX_SUCCESS) { + env->me_txn->mt_end_pgno = aligned; + goto done; + } + + mdbx_error("unable growth datafile to %" PRIaPGNO " pages (+%" PRIaPGNO + "), errcode %d", + aligned, aligned - txn->mt_end_pgno, ret.err); + } else { + mdbx_debug("gc-alloc: next %u > upper %u", next, txn->mt_geo.upper); + } + } + + fail: + mdbx_tassert(txn, + mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + if (likely(!(flags & MDBX_ALLOC_SLOT))) + txn->mt_flags |= MDBX_TXN_ERROR; + mdbx_assert(env, ret.err != MDBX_SUCCESS); + ret.page = NULL; + return ret; + } + +done: + ret.page = NULL; + if (unlikely(flags & MDBX_ALLOC_SLOT)) { + ret.err = MDBX_SUCCESS; + return ret; + } + + mdbx_ensure(env, pgno >= NUM_METAS); + if (env->me_flags & MDBX_WRITEMAP) { + ret.page = pgno2page(env, pgno); + /* LY: reset no-access flag from mdbx_page_loose() */ + VALGRIND_MAKE_MEM_UNDEFINED(ret.page, pgno2bytes(env, num)); + ASAN_UNPOISON_MEMORY_REGION(ret.page, pgno2bytes(env, num)); + } else { + if (unlikely(!(ret.page = mdbx_page_malloc(txn, num)))) { + ret.err = MDBX_ENOMEM; + goto fail; + } + } + + if (range_begin) { + mdbx_cassert(mc, (mc->mc_flags & C_GCFREEZE) == 0); + mdbx_tassert(txn, pgno < txn->mt_next_pgno); + mdbx_tassert(txn, pgno == re_list[range_begin]); + /* Cutoff allocated pages from tw.reclaimed_pglist */ +#if MDBX_PNL_ASCENDING + for (unsigned i = range_begin + num; i <= re_len;) + re_list[range_begin++] = re_list[i++]; + MDBX_PNL_SIZE(re_list) = re_len = range_begin - 1; +#else + MDBX_PNL_SIZE(re_list) = re_len -= num; + for (unsigned i = range_begin - num; i < re_len;) + re_list[++i] = re_list[++range_begin]; +#endif + mdbx_tassert(txn, + mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + } else { + txn->mt_next_pgno = pgno + num; + mdbx_assert(env, txn->mt_next_pgno <= txn->mt_end_pgno); + } + + if (unlikely(env->me_flags & MDBX_PAGEPERTURB)) + memset(ret.page, -1, pgno2bytes(env, num)); + VALGRIND_MAKE_MEM_UNDEFINED(ret.page, pgno2bytes(env, num)); + + ret.page->mp_pgno = pgno; + ret.page->mp_leaf2_ksize = 0; + ret.page->mp_flags = 0; + if ((mdbx_assert_enabled() || mdbx_audit_enabled()) && num > 1) { + ret.page->mp_pages = num; + ret.page->mp_flags = P_OVERFLOW; + } + ret.err = mdbx_page_dirty(txn, ret.page, num); + if (unlikely(ret.err != MDBX_SUCCESS)) + goto fail; + + mdbx_tassert(txn, + mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + return ret; +} + +/* Copy the used portions of a non-overflow page. 
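+ * For such pages only the header, the node area below mp_lower and the
+ * data area above mp_upper carry payload, so when the unused gap between
+ * them is large enough the two used ends are copied separately.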
*/ +__hot static void mdbx_page_copy(MDBX_page *dst, const MDBX_page *src, + size_t psize) { + STATIC_ASSERT(UINT16_MAX > MAX_PAGESIZE - PAGEHDRSZ); + STATIC_ASSERT(MIN_PAGESIZE > PAGEHDRSZ + NODESIZE * 4); + if ((src->mp_flags & (P_LEAF2 | P_OVERFLOW)) == 0) { + size_t upper = src->mp_upper, lower = src->mp_lower, unused = upper - lower; + + /* If page isn't full, just copy the used portion. Adjust + * alignment so memcpy may copy words instead of bytes. */ + if (unused >= MDBX_CACHELINE_SIZE * 2) { + lower = ceil_powerof2(lower + PAGEHDRSZ, sizeof(void *)); + upper = floor_powerof2(upper + PAGEHDRSZ, sizeof(void *)); + memcpy(dst, src, lower); + dst = (void *)((char *)dst + upper); + src = (void *)((char *)src + upper); + psize -= upper; + } + } + memcpy(dst, src, psize); +} + +/* Pull a page off the txn's spill list, if present. + * + * If a page being referenced was spilled to disk in this txn, bring + * it back and make it dirty/writable again. */ +static struct page_result __must_check_result +mdbx_page_unspill(MDBX_txn *const txn, const MDBX_page *const mp) { + mdbx_verbose("unspill page %" PRIaPGNO, mp->mp_pgno); + mdbx_tassert(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0); + mdbx_tassert(txn, IS_SPILLED(txn, mp)); + const pgno_t spilled_pgno = mp->mp_pgno << 1; + const MDBX_txn *scan = txn; + struct page_result ret; + do { + mdbx_tassert(txn, (scan->mt_flags & MDBX_TXN_SPILLS) != 0); + if (!scan->tw.spill_pages) + continue; + const unsigned si = mdbx_pnl_exist(scan->tw.spill_pages, spilled_pgno); + if (!si) + continue; + const unsigned npages = IS_OVERFLOW(mp) ? mp->mp_pages : 1; + ret.page = mdbx_page_malloc(txn, npages); + if (unlikely(!ret.page)) { + ret.err = MDBX_ENOMEM; + return ret; + } + mdbx_page_copy(ret.page, mp, pgno2bytes(txn->mt_env, npages)); + if (scan == txn) { + /* If in current txn, this page is no longer spilled. + * If it happens to be the last page, truncate the spill list. + * Otherwise mark it as deleted by setting the LSB. */ + mdbx_spill_remove(txn, si, npages); + } /* otherwise, if belonging to a parent txn, the + * page remains spilled until child commits */ + + ret.err = mdbx_page_dirty(txn, ret.page, npages); + if (unlikely(ret.err != MDBX_SUCCESS)) + return ret; +#if MDBX_ENABLE_PGOP_STAT + safe64_inc(&txn->mt_env->me_lck->mti_pgop_stat.unspill, npages); +#endif /* MDBX_ENABLE_PGOP_STAT */ + ret.page->mp_flags |= (scan == txn) ? 0 : P_SPILLED; + ret.err = MDBX_SUCCESS; + return ret; + } while (likely((scan = scan->mt_parent) != nullptr && + (scan->mt_flags & MDBX_TXN_SPILLS) != 0)); + mdbx_error("Page %" PRIaPGNO " mod-txnid %" PRIaTXN + " not found in the spill-list(s), current txn %" PRIaTXN + " front %" PRIaTXN ", root txn %" PRIaTXN " front %" PRIaTXN, + mp->mp_pgno, mp->mp_txnid, txn->mt_txnid, txn->mt_front, + txn->mt_env->me_txn0->mt_txnid, txn->mt_env->me_txn0->mt_front); + ret.err = MDBX_PROBLEM; + ret.page = NULL; + return ret; +} + +/* Touch a page: make it dirty and re-insert into tree with updated pgno. + * Set MDBX_TXN_ERROR on failure. + * + * [in] mc cursor pointing to the page to be touched + * + * Returns 0 on success, non-zero on failure. 
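+ *
+ * Three cases are handled: a frozen page is CoW-ed via mdbx_page_alloc()
+ * and its old pgno is retired; a spilled page is brought back through
+ * mdbx_page_unspill(); a page shadowed only by the parent txn is cloned
+ * into the nested txn.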
*/ +__hot static int mdbx_page_touch(MDBX_cursor *mc) { + const MDBX_page *const mp = mc->mc_pg[mc->mc_top]; + MDBX_page *np; + MDBX_txn *txn = mc->mc_txn; + int rc; + + if (mdbx_assert_enabled()) { + if (mc->mc_flags & C_SUB) { + MDBX_xcursor *mx = container_of(mc->mc_db, MDBX_xcursor, mx_db); + MDBX_cursor_couple *couple = container_of(mx, MDBX_cursor_couple, inner); + mdbx_tassert(txn, mc->mc_db == &couple->outer.mc_xcursor->mx_db); + mdbx_tassert(txn, mc->mc_dbx == &couple->outer.mc_xcursor->mx_dbx); + mdbx_tassert(txn, *couple->outer.mc_dbistate & DBI_DIRTY); + } else { + mdbx_tassert(txn, *mc->mc_dbistate & DBI_DIRTY); + } + mdbx_tassert(txn, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY); + mdbx_tassert(txn, !IS_OVERFLOW(mp)); + mdbx_tassert(txn, mdbx_dirtylist_check(txn)); + } + + if (IS_MODIFIABLE(txn, mp) || IS_SUBP(mp)) + return MDBX_SUCCESS; + + if (IS_FROZEN(txn, mp)) { + /* CoW the page */ + rc = mdbx_pnl_need(&txn->tw.retired_pages, 1); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + const struct page_result par = mdbx_page_alloc(mc, 1, MDBX_ALLOC_ALL); + rc = par.err; + np = par.page; + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + + const pgno_t pgno = np->mp_pgno; + mdbx_debug("touched db %d page %" PRIaPGNO " -> %" PRIaPGNO, DDBI(mc), + mp->mp_pgno, pgno); + mdbx_tassert(txn, mp->mp_pgno != pgno); + mdbx_pnl_xappend(txn->tw.retired_pages, mp->mp_pgno); + /* Update the parent page, if any, to point to the new page */ + if (mc->mc_top) { + MDBX_page *parent = mc->mc_pg[mc->mc_top - 1]; + MDBX_node *node = page_node(parent, mc->mc_ki[mc->mc_top - 1]); + node_set_pgno(node, pgno); + } else { + mc->mc_db->md_root = pgno; + } + +#if MDBX_ENABLE_PGOP_STAT + safe64_inc(&txn->mt_env->me_lck->mti_pgop_stat.cow, 1); +#endif /* MDBX_ENABLE_PGOP_STAT */ + mdbx_page_copy(np, mp, txn->mt_env->me_psize); + np->mp_pgno = pgno; + np->mp_txnid = txn->mt_front; + } else if (IS_SPILLED(txn, mp)) { + struct page_result pur = mdbx_page_unspill(txn, mp); + np = pur.page; + rc = pur.err; + if (likely(rc == MDBX_SUCCESS)) { + mdbx_tassert(txn, np != nullptr); + goto done; + } + goto fail; + } else { + if (unlikely(!txn->mt_parent)) { + mdbx_error("Unexpected not frozen/modifiable/spilled but shadowed %s " + "page %" PRIaPGNO " mod-txnid %" PRIaTXN "," + " without parent transaction, current txn %" PRIaTXN + " front %" PRIaTXN, + IS_BRANCH(mp) ? 
"branch" : "leaf", mp->mp_pgno, mp->mp_txnid, + mc->mc_txn->mt_txnid, mc->mc_txn->mt_front); + rc = MDBX_PROBLEM; + goto fail; + } + + mdbx_debug("clone db %d page %" PRIaPGNO, DDBI(mc), mp->mp_pgno); + mdbx_tassert(txn, txn->tw.dirtylist->length <= + MDBX_PGL_LIMIT + MDBX_PNL_GRANULATE); + /* No - copy it */ + np = mdbx_page_malloc(txn, 1); + if (unlikely(!np)) { + rc = MDBX_ENOMEM; + goto fail; + } + mdbx_page_copy(np, mp, txn->mt_env->me_psize); + + /* insert a clone of parent's dirty page, so don't touch dirtyroom */ + rc = mdbx_page_dirty(txn, np, 1); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + +#if MDBX_ENABLE_PGOP_STAT + safe64_inc(&txn->mt_env->me_lck->mti_pgop_stat.clone, 1); +#endif /* MDBX_ENABLE_PGOP_STAT */ + } + +done: + /* Adjust cursors pointing to mp */ + mc->mc_pg[mc->mc_top] = np; + MDBX_cursor *m2 = txn->tw.cursors[mc->mc_dbi]; + if (mc->mc_flags & C_SUB) { + for (; m2; m2 = m2->mc_next) { + MDBX_cursor *m3 = &m2->mc_xcursor->mx_cursor; + if (m3->mc_snum < mc->mc_snum) + continue; + if (m3->mc_pg[mc->mc_top] == mp) + m3->mc_pg[mc->mc_top] = np; + } + } else { + for (; m2; m2 = m2->mc_next) { + if (m2->mc_snum < mc->mc_snum) + continue; + if (m2 == mc) + continue; + if (m2->mc_pg[mc->mc_top] == mp) { + m2->mc_pg[mc->mc_top] = np; + if (XCURSOR_INITED(m2) && IS_LEAF(np)) + XCURSOR_REFRESH(m2, np, m2->mc_ki[mc->mc_top]); + } + } + } + return MDBX_SUCCESS; + +fail: + txn->mt_flags |= MDBX_TXN_ERROR; + return rc; +} + +__cold static int mdbx_env_sync_internal(MDBX_env *env, bool force, + bool nonblock) { + unsigned flags = env->me_flags & ~MDBX_NOMETASYNC; + if (unlikely(flags & (MDBX_RDONLY | MDBX_FATAL_ERROR))) + return MDBX_EACCESS; + + int rc = MDBX_RESULT_TRUE /* means "nothing to sync" */; + bool need_unlock = false; + if (nonblock && + atomic_load32(&env->me_lck->mti_unsynced_pages, mo_AcquireRelease) == 0) + goto fastpath; + + const bool outside_txn = (env->me_txn0->mt_owner != mdbx_thread_self()); + if (outside_txn) { + int err = mdbx_txn_lock(env, nonblock); + if (unlikely(err != MDBX_SUCCESS)) + return err; + need_unlock = true; + } + + const MDBX_meta *head = mdbx_meta_head(env); + pgno_t unsynced_pages = + atomic_load32(&env->me_lck->mti_unsynced_pages, mo_Relaxed); + if (!META_IS_STEADY(head) || unsynced_pages) { + const pgno_t autosync_threshold = + atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed); + const uint64_t autosync_period = + atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed); + if (force || (autosync_threshold && unsynced_pages >= autosync_threshold) || + (autosync_period && + mdbx_osal_monotime() - + atomic_load64(&env->me_lck->mti_sync_timestamp, mo_Relaxed) >= + autosync_period)) + flags &= MDBX_WRITEMAP /* clear flags for full steady sync */; + + if (outside_txn) { + if (unsynced_pages > /* FIXME: define threshold */ 16 && + (flags & MDBX_SAFE_NOSYNC) == 0) { + mdbx_assert(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); + const size_t usedbytes = pgno_align2os_bytes(env, head->mm_geo.next); + +#if MDBX_ENABLE_PGOP_STAT + safe64_inc(&env->me_lck->mti_pgop_stat.wops, 1); +#endif /* MDBX_ENABLE_PGOP_STAT */ + mdbx_txn_unlock(env); + + /* LY: pre-sync without holding lock to reduce latency for writer(s) */ + int err = + (flags & MDBX_WRITEMAP) + ? 
mdbx_msync(&env->me_dxb_mmap, 0, usedbytes, MDBX_SYNC_DATA) + : mdbx_fsync(env->me_lazy_fd, MDBX_SYNC_DATA); + if (unlikely(err != MDBX_SUCCESS)) + return err; + + err = mdbx_txn_lock(env, nonblock); + if (unlikely(err != MDBX_SUCCESS)) + return err; + + /* LY: head and unsynced_pages may be changed. */ + head = mdbx_meta_head(env); + unsynced_pages = + atomic_load32(&env->me_lck->mti_unsynced_pages, mo_Relaxed); + } + env->me_txn0->mt_txnid = meta_txnid(env, head, false); + mdbx_find_oldest(env->me_txn0); + rc = MDBX_RESULT_FALSE /* means "some data was synced" */; + } + + if (!META_IS_STEADY(head) || + ((flags & MDBX_SAFE_NOSYNC) == 0 && unsynced_pages)) { + mdbx_debug("meta-head %" PRIaPGNO ", %s, sync_pending %" PRIaPGNO, + data_page(head)->mp_pgno, mdbx_durable_str(head), + unsynced_pages); + MDBX_meta meta = *head; + int err = mdbx_sync_locked(env, flags | MDBX_SHRINK_ALLOWED, &meta); + if (unlikely(err != MDBX_SUCCESS)) { + if (need_unlock) + mdbx_txn_unlock(env); + return err; + } + rc = MDBX_RESULT_FALSE /* means "some data was synced" */; + } + } + +fastpath: + /* LY: sync meta-pages if MDBX_NOMETASYNC enabled + * and someone was not synced above. */ + if (rc == MDBX_RESULT_TRUE && (env->me_flags & MDBX_NOMETASYNC) != 0) { + const txnid_t head_txnid = mdbx_recent_committed_txnid(env); + if (atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed) != + (uint32_t)head_txnid) { +#if MDBX_ENABLE_PGOP_STAT + safe64_inc(&env->me_lck->mti_pgop_stat.wops, 1); +#endif /* MDBX_ENABLE_PGOP_STAT */ + rc = (flags & MDBX_WRITEMAP) + ? mdbx_msync(&env->me_dxb_mmap, 0, + pgno_align2os_bytes(env, NUM_METAS), + MDBX_SYNC_DATA | MDBX_SYNC_IODQ) + : mdbx_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + if (likely(rc == MDBX_SUCCESS)) + atomic_store32(&env->me_lck->mti_meta_sync_txnid, (uint32_t)head_txnid, + mo_Relaxed); + } + } + if (need_unlock) + mdbx_txn_unlock(env); + return rc; +} + +static __inline int check_env(const MDBX_env *env, const bool wanna_active) { + if (unlikely(!env)) + return MDBX_EINVAL; + + if (unlikely(env->me_signature.weak != MDBX_ME_SIGNATURE)) + return MDBX_EBADSIGN; + +#if MDBX_ENV_CHECKPID + if (unlikely(env->me_pid != mdbx_getpid())) { + ((MDBX_env *)env)->me_flags |= MDBX_FATAL_ERROR; + return MDBX_PANIC; + } +#endif /* MDBX_ENV_CHECKPID */ + + if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) + return MDBX_PANIC; + + if (wanna_active) { + if (unlikely((env->me_flags & MDBX_ENV_ACTIVE) == 0)) + return MDBX_EPERM; + mdbx_assert(env, env->me_map != nullptr); + } + + return MDBX_SUCCESS; +} + +__cold int mdbx_env_sync_ex(MDBX_env *env, bool force, bool nonblock) { + int rc = check_env(env, true); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + return mdbx_env_sync_internal(env, force, nonblock); +} + +#ifndef LIBMDBX_NO_EXPORTS_LEGACY_API +__cold int mdbx_env_sync(MDBX_env *env) { return __inline_mdbx_env_sync(env); } + +__cold int mdbx_env_sync_poll(MDBX_env *env) { + return __inline_mdbx_env_sync_poll(env); +} +#endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */ + +/* Back up parent txn's cursors, then grab the originals for tracking */ +static int mdbx_cursor_shadow(MDBX_txn *parent, MDBX_txn *nested) { + for (int i = parent->mt_numdbs; --i >= 0;) { + nested->tw.cursors[i] = NULL; + MDBX_cursor *mc = parent->tw.cursors[i]; + if (mc != NULL) { + size_t size = mc->mc_xcursor ? 
sizeof(MDBX_cursor) + sizeof(MDBX_xcursor)
+                                    : sizeof(MDBX_cursor);
+      for (MDBX_cursor *bk; mc; mc = bk->mc_next) {
+        bk = mc;
+        if (mc->mc_signature != MDBX_MC_LIVE)
+          continue;
+        bk = mdbx_malloc(size);
+        if (unlikely(!bk))
+          return MDBX_ENOMEM;
+        *bk = *mc;
+        mc->mc_backup = bk;
+        /* Kill pointers into the parent txn to reduce abuse: the user may
+         * not use mc until the nested txn ends. But we need a valid txn
+         * pointer here for cursor fixups to keep working. */
+        mc->mc_txn = nested;
+        mc->mc_db = &nested->mt_dbs[i];
+        mc->mc_dbistate = &nested->mt_dbistate[i];
+        MDBX_xcursor *mx = mc->mc_xcursor;
+        if (mx != NULL) {
+          *(MDBX_xcursor *)(bk + 1) = *mx;
+          mx->mx_cursor.mc_txn = nested;
+        }
+        mc->mc_next = nested->tw.cursors[i];
+        nested->tw.cursors[i] = mc;
+      }
+    }
+  }
+  return MDBX_SUCCESS;
+}
+
+/* Close this write txn's cursors, give parent txn's cursors back to parent.
+ *
+ * [in] txn    the transaction handle.
+ * [in] merge  true to keep changes to parent cursors, false to revert. */
+static void mdbx_cursors_eot(MDBX_txn *txn, const bool merge) {
+  mdbx_tassert(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0);
+  for (int i = txn->mt_numdbs; --i >= 0;) {
+    MDBX_cursor *next, *mc = txn->tw.cursors[i];
+    if (!mc)
+      continue;
+    txn->tw.cursors[i] = NULL;
+    do {
+      const unsigned stage = mc->mc_signature;
+      MDBX_cursor *bk = mc->mc_backup;
+      next = mc->mc_next;
+      mdbx_ensure(txn->mt_env,
+                  stage == MDBX_MC_LIVE || (stage == MDBX_MC_WAIT4EOT && bk));
+      mdbx_cassert(mc, mc->mc_dbi == (unsigned)i);
+      if (bk) {
+        MDBX_xcursor *mx = mc->mc_xcursor;
+        mdbx_cassert(mc, mx == bk->mc_xcursor);
+        mdbx_tassert(txn, txn->mt_parent != NULL);
+        mdbx_ensure(txn->mt_env, bk->mc_signature == MDBX_MC_LIVE);
+        if (stage == MDBX_MC_WAIT4EOT /* Cursor was closed by user */)
+          mc->mc_signature = stage /* Promote closed state to parent txn */;
+        else if (merge) {
+          /* Restore pointers to parent txn */
+          mc->mc_next = bk->mc_next;
+          mc->mc_backup = bk->mc_backup;
+          mc->mc_txn = bk->mc_txn;
+          mc->mc_db = bk->mc_db;
+          mc->mc_dbistate = bk->mc_dbistate;
+          if (mx) {
+            if (mx != bk->mc_xcursor) {
+              *bk->mc_xcursor = *mx;
+              mx = bk->mc_xcursor;
+            }
+            mx->mx_cursor.mc_txn = bk->mc_txn;
+          }
+        } else {
+          /* Restore from backup, i.e. rollback/abort nested txn */
+          *mc = *bk;
+          if (mx)
+            *mx = *(MDBX_xcursor *)(bk + 1);
+        }
+        bk->mc_signature = 0;
+        mdbx_free(bk);
+      } else {
+        mdbx_ensure(txn->mt_env, stage == MDBX_MC_LIVE);
+        mc->mc_signature = MDBX_MC_READY4CLOSE /* Cursor may be reused */;
+        mc->mc_flags = 0 /* reset C_UNTRACK */;
+      }
+    } while ((mc = next) != NULL);
+  }
+}
+
+#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__)
+/* Find largest mvcc-snapshot still referenced by this process.
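+ * Used only in VALGRIND/ASAN builds: pages above every snapshot still in
+ * use by this process may be poisoned at transaction end. */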
*/ +static pgno_t mdbx_find_largest_this(MDBX_env *env, pgno_t largest) { + MDBX_lockinfo *const lck = env->me_lck_mmap.lck; + if (likely(lck != NULL /* exclusive mode */)) { + const unsigned snap_nreaders = + atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); + for (unsigned i = 0; i < snap_nreaders; ++i) { + retry: + if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease) == + env->me_pid) { + /* mdbx_jitter4testing(true); */ + const pgno_t snap_pages = atomic_load32( + &lck->mti_readers[i].mr_snapshot_pages_used, mo_Relaxed); + const txnid_t snap_txnid = safe64_read(&lck->mti_readers[i].mr_txnid); + if (unlikely( + snap_pages != + atomic_load32(&lck->mti_readers[i].mr_snapshot_pages_used, + mo_AcquireRelease) || + snap_txnid != safe64_read(&lck->mti_readers[i].mr_txnid))) + goto retry; + if (largest < snap_pages && + atomic_load64(&lck->mti_oldest_reader, mo_AcquireRelease) <= + /* ignore pending updates */ snap_txnid && + snap_txnid <= MAX_TXNID) + largest = snap_pages; + } + } + } + return largest; +} + +static void mdbx_txn_valgrind(MDBX_env *env, MDBX_txn *txn) { +#if !defined(__SANITIZE_ADDRESS__) + if (!RUNNING_ON_VALGRIND) + return; +#endif + + if (txn) { /* transaction start */ + if (env->me_poison_edge < txn->mt_next_pgno) + env->me_poison_edge = txn->mt_next_pgno; + VALGRIND_MAKE_MEM_DEFINED(env->me_map, pgno2bytes(env, txn->mt_next_pgno)); + ASAN_UNPOISON_MEMORY_REGION(env->me_map, + pgno2bytes(env, txn->mt_next_pgno)); + /* don't touch more, it should be already poisoned */ + } else { /* transaction end */ + bool should_unlock = false; + pgno_t last = MAX_PAGENO; + if (env->me_txn0 && env->me_txn0->mt_owner == mdbx_thread_self()) { + /* inside write-txn */ + MDBX_meta *head = mdbx_meta_head(env); + last = head->mm_geo.next; + } else if (env->me_flags & MDBX_RDONLY) { + /* read-only mode, no write-txn, no wlock mutex */ + last = NUM_METAS; + } else if (mdbx_txn_lock(env, true) == MDBX_SUCCESS) { + /* no write-txn */ + last = NUM_METAS; + should_unlock = true; + } else { + /* write txn is running, therefore shouldn't poison any memory range */ + return; + } + + last = mdbx_find_largest_this(env, last); + const pgno_t edge = env->me_poison_edge; + if (edge > last) { + mdbx_assert(env, last >= NUM_METAS); + env->me_poison_edge = last; + VALGRIND_MAKE_MEM_NOACCESS(env->me_map + pgno2bytes(env, last), + pgno2bytes(env, edge - last)); + ASAN_POISON_MEMORY_REGION(env->me_map + pgno2bytes(env, last), + pgno2bytes(env, edge - last)); + } + if (should_unlock) + mdbx_txn_unlock(env); + } +} +#endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */ + +typedef struct { + int err; + MDBX_reader *rslot; +} bind_rslot_result; + +static bind_rslot_result bind_rslot(MDBX_env *env, const uintptr_t tid) { + mdbx_assert(env, env->me_lck_mmap.lck); + mdbx_assert(env, env->me_lck->mti_magic_and_version == MDBX_LOCK_MAGIC); + mdbx_assert(env, env->me_lck->mti_os_and_format == MDBX_LOCK_FORMAT); + + bind_rslot_result result = {mdbx_rdt_lock(env), nullptr}; + if (unlikely(MDBX_IS_ERROR(result.err))) + return result; + if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) { + mdbx_rdt_unlock(env); + result.err = MDBX_PANIC; + return result; + } + if (unlikely(!env->me_map)) { + mdbx_rdt_unlock(env); + result.err = MDBX_EPERM; + return result; + } + + if (unlikely(env->me_live_reader != env->me_pid)) { + result.err = mdbx_rpid_set(env); + if (unlikely(result.err != MDBX_SUCCESS)) { + mdbx_rdt_unlock(env); + return result; + } + env->me_live_reader = env->me_pid; + } + + result.err = MDBX_SUCCESS; 
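+  /* Scan the reader table for a free slot; if the table is full, try to
+   * reclaim slots of dead readers before failing with MDBX_READERS_FULL. */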
+ unsigned slot, nreaders; + while (1) { + nreaders = atomic_load32(&env->me_lck->mti_numreaders, mo_Relaxed); + for (slot = 0; slot < nreaders; slot++) + if (atomic_load32(&env->me_lck->mti_readers[slot].mr_pid, mo_Relaxed) == + 0) + break; + + if (likely(slot < env->me_maxreaders)) + break; + + result.err = mdbx_cleanup_dead_readers(env, true, NULL); + if (result.err != MDBX_RESULT_TRUE) { + mdbx_rdt_unlock(env); + result.err = + (result.err == MDBX_SUCCESS) ? MDBX_READERS_FULL : result.err; + return result; + } + } + + result.rslot = &env->me_lck->mti_readers[slot]; + /* Claim the reader slot, carefully since other code + * uses the reader table un-mutexed: First reset the + * slot, next publish it in lck->mti_numreaders. After + * that, it is safe for mdbx_env_close() to touch it. + * When it will be closed, we can finally claim it. */ + atomic_store32(&result.rslot->mr_pid, 0, mo_Relaxed); + safe64_reset(&result.rslot->mr_txnid, true); + if (slot == nreaders) + atomic_store32(&env->me_lck->mti_numreaders, ++nreaders, mo_Relaxed); + atomic_store64(&result.rslot->mr_tid, (env->me_flags & MDBX_NOTLS) ? 0 : tid, + mo_Relaxed); + atomic_store32(&result.rslot->mr_pid, env->me_pid, mo_Relaxed); + mdbx_rdt_unlock(env); + + if (likely(env->me_flags & MDBX_ENV_TXKEY)) { + mdbx_assert(env, env->me_live_reader == env->me_pid); + thread_rthc_set(env->me_txkey, result.rslot); + } + return result; +} + +__cold int mdbx_thread_register(const MDBX_env *env) { + int rc = check_env(env, true); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(!env->me_lck_mmap.lck)) + return (env->me_flags & MDBX_EXCLUSIVE) ? MDBX_EINVAL : MDBX_EPERM; + + if (unlikely((env->me_flags & MDBX_ENV_TXKEY) == 0)) { + mdbx_assert(env, !env->me_lck_mmap.lck || (env->me_flags & MDBX_NOTLS)); + return MDBX_EINVAL /* MDBX_NOTLS mode */; + } + + mdbx_assert(env, (env->me_flags & (MDBX_NOTLS | MDBX_ENV_TXKEY | + MDBX_EXCLUSIVE)) == MDBX_ENV_TXKEY); + MDBX_reader *r = thread_rthc_get(env->me_txkey); + if (unlikely(r != NULL)) { + mdbx_assert(env, r->mr_pid.weak == env->me_pid); + mdbx_assert(env, r->mr_tid.weak == mdbx_thread_self()); + if (unlikely(r->mr_pid.weak != env->me_pid)) + return MDBX_BAD_RSLOT; + return MDBX_RESULT_TRUE /* already registered */; + } + + const uintptr_t tid = mdbx_thread_self(); + if (env->me_txn0 && unlikely(env->me_txn0->mt_owner == tid)) + return MDBX_TXN_OVERLAPPING; + return bind_rslot((MDBX_env *)env, tid).err; +} + +__cold int mdbx_thread_unregister(const MDBX_env *env) { + int rc = check_env(env, true); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(!env->me_lck_mmap.lck)) + return MDBX_RESULT_TRUE; + + if (unlikely((env->me_flags & MDBX_ENV_TXKEY) == 0)) { + mdbx_assert(env, !env->me_lck_mmap.lck || (env->me_flags & MDBX_NOTLS)); + return MDBX_RESULT_TRUE /* MDBX_NOTLS mode */; + } + + mdbx_assert(env, (env->me_flags & (MDBX_NOTLS | MDBX_ENV_TXKEY | + MDBX_EXCLUSIVE)) == MDBX_ENV_TXKEY); + MDBX_reader *r = thread_rthc_get(env->me_txkey); + if (unlikely(r == NULL)) + return MDBX_RESULT_TRUE /* not registered */; + + mdbx_assert(env, r->mr_pid.weak == env->me_pid); + mdbx_assert(env, r->mr_tid.weak == mdbx_thread_self()); + if (unlikely(r->mr_pid.weak != env->me_pid || + r->mr_tid.weak != mdbx_thread_self())) + return MDBX_BAD_RSLOT; + + if (unlikely(r->mr_txnid.weak < SAFE64_INVALID_THRESHOLD)) + return MDBX_BUSY /* transaction is still active */; + + atomic_store32(&r->mr_pid, 0, mo_Relaxed); + atomic_store32(&env->me_lck->mti_readers_refresh_flag, true, + 
mo_AcquireRelease); + thread_rthc_set(env->me_txkey, nullptr); + return MDBX_SUCCESS; +} + +/* Common code for mdbx_txn_begin() and mdbx_txn_renew(). */ +static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { + MDBX_env *env = txn->mt_env; + int rc; + +#if MDBX_ENV_CHECKPID + if (unlikely(env->me_pid != mdbx_getpid())) { + env->me_flags |= MDBX_FATAL_ERROR; + return MDBX_PANIC; + } +#endif /* MDBX_ENV_CHECKPID */ + + STATIC_ASSERT(sizeof(MDBX_reader) == 32); +#if MDBX_LOCKING > 0 + STATIC_ASSERT(offsetof(MDBX_lockinfo, mti_wlock) % MDBX_CACHELINE_SIZE == 0); + STATIC_ASSERT(offsetof(MDBX_lockinfo, mti_rlock) % MDBX_CACHELINE_SIZE == 0); +#else + STATIC_ASSERT( + offsetof(MDBX_lockinfo, mti_oldest_reader) % MDBX_CACHELINE_SIZE == 0); + STATIC_ASSERT(offsetof(MDBX_lockinfo, mti_numreaders) % MDBX_CACHELINE_SIZE == + 0); +#endif /* MDBX_LOCKING */ + STATIC_ASSERT(offsetof(MDBX_lockinfo, mti_readers) % MDBX_CACHELINE_SIZE == + 0); + + const uintptr_t tid = mdbx_thread_self(); + if (flags & MDBX_TXN_RDONLY) { + mdbx_assert(env, (flags & ~(MDBX_TXN_RO_BEGIN_FLAGS | MDBX_WRITEMAP)) == 0); + txn->mt_flags = + MDBX_TXN_RDONLY | (env->me_flags & (MDBX_NOTLS | MDBX_WRITEMAP)); + MDBX_reader *r = txn->to.reader; + STATIC_ASSERT(sizeof(uintptr_t) <= sizeof(r->mr_tid)); + if (likely(env->me_flags & MDBX_ENV_TXKEY)) { + mdbx_assert(env, !(env->me_flags & MDBX_NOTLS)); + r = thread_rthc_get(env->me_txkey); + if (likely(r)) { + if (unlikely(!r->mr_pid.weak) && + (mdbx_runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN)) { + thread_rthc_set(env->me_txkey, nullptr); + r = nullptr; + } else { + mdbx_assert(env, r->mr_pid.weak == env->me_pid); + mdbx_assert(env, r->mr_tid.weak == mdbx_thread_self()); + } + } + } else { + mdbx_assert(env, !env->me_lck_mmap.lck || (env->me_flags & MDBX_NOTLS)); + } + + if (likely(r)) { + if (unlikely(r->mr_pid.weak != env->me_pid || + r->mr_txnid.weak < SAFE64_INVALID_THRESHOLD)) + return MDBX_BAD_RSLOT; + } else if (env->me_lck_mmap.lck) { + bind_rslot_result brs = bind_rslot(env, tid); + if (unlikely(brs.err != MDBX_SUCCESS)) + return brs.err; + r = brs.rslot; + } + txn->to.reader = r; + if (flags & (MDBX_TXN_RDONLY_PREPARE - MDBX_TXN_RDONLY)) { + mdbx_assert(env, txn->mt_txnid == 0); + mdbx_assert(env, txn->mt_owner == 0); + mdbx_assert(env, txn->mt_numdbs == 0); + if (likely(r)) { + mdbx_assert(env, r->mr_snapshot_pages_used.weak == 0); + mdbx_assert(env, r->mr_txnid.weak >= SAFE64_INVALID_THRESHOLD); + atomic_store32(&r->mr_snapshot_pages_used, 0, mo_Relaxed); + } + txn->mt_flags = MDBX_TXN_RDONLY | MDBX_TXN_FINISHED; + return MDBX_SUCCESS; + } + + /* Seek & fetch the last meta */ + if (likely(/* not recovery mode */ env->me_stuck_meta < 0)) { + while (1) { + MDBX_meta *const meta = mdbx_meta_head(env); + mdbx_jitter4testing(false); + const txnid_t snap = mdbx_meta_txnid_fluid(env, meta); + mdbx_jitter4testing(false); + if (likely(r)) { + safe64_reset(&r->mr_txnid, false); + atomic_store32(&r->mr_snapshot_pages_used, meta->mm_geo.next, + mo_Relaxed); + atomic_store64(&r->mr_snapshot_pages_retired, + unaligned_peek_u64(4, meta->mm_pages_retired), + mo_Relaxed); + safe64_write(&r->mr_txnid, snap); + mdbx_jitter4testing(false); + mdbx_assert(env, r->mr_pid.weak == mdbx_getpid()); + mdbx_assert( + env, r->mr_tid.weak == + ((env->me_flags & MDBX_NOTLS) ? 
0 : mdbx_thread_self())); + mdbx_assert(env, r->mr_txnid.weak == snap); + atomic_store32(&env->me_lck->mti_readers_refresh_flag, true, + mo_AcquireRelease); + } + mdbx_jitter4testing(true); + + /* Snap the state from current meta-head */ + txn->mt_txnid = snap; + txn->mt_geo = meta->mm_geo; + memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDBX_db)); + txn->mt_canary = meta->mm_canary; + + /* LY: Retry on a race, ITS#7970. */ + if (likely(meta == mdbx_meta_head(env) && + snap == mdbx_meta_txnid_fluid(env, meta) && + snap >= atomic_load64(&env->me_lck->mti_oldest_reader, + mo_AcquireRelease))) { + mdbx_jitter4testing(false); + break; + } + } + } else { + /* r/o recovery mode */ + MDBX_meta *const meta = METAPAGE(env, env->me_stuck_meta); + txn->mt_txnid = mdbx_meta_txnid_stable(env, meta); + txn->mt_geo = meta->mm_geo; + memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDBX_db)); + txn->mt_canary = meta->mm_canary; + if (likely(r)) { + atomic_store32(&r->mr_snapshot_pages_used, meta->mm_geo.next, + mo_Relaxed); + atomic_store64(&r->mr_snapshot_pages_retired, + unaligned_peek_u64(4, meta->mm_pages_retired), + mo_Relaxed); + atomic_store64(&r->mr_txnid, txn->mt_txnid, mo_Relaxed); + mdbx_jitter4testing(false); + mdbx_assert(env, r->mr_pid.weak == mdbx_getpid()); + mdbx_assert( + env, r->mr_tid.weak == + ((env->me_flags & MDBX_NOTLS) ? 0 : mdbx_thread_self())); + mdbx_assert(env, r->mr_txnid.weak == txn->mt_txnid); + atomic_store32(&env->me_lck->mti_readers_refresh_flag, true, + mo_Relaxed); + } + } + + if (unlikely(txn->mt_txnid < MIN_TXNID || txn->mt_txnid > MAX_TXNID)) { + mdbx_error("%s", "environment corrupted by died writer, must shutdown!"); + rc = MDBX_CORRUPTED; + goto bailout; + } + mdbx_assert(env, txn->mt_txnid >= env->me_lck->mti_oldest_reader.weak); + txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */ + mdbx_ensure(env, txn->mt_txnid >= + /* paranoia is appropriate here */ env->me_lck + ->mti_oldest_reader.weak); + txn->mt_numdbs = env->me_numdbs; + } else { + mdbx_assert(env, (flags & ~(MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_SPILLS | + MDBX_WRITEMAP)) == 0); + if (unlikely(txn->mt_owner == tid || + /* not recovery mode */ env->me_stuck_meta >= 0)) + return MDBX_BUSY; + MDBX_lockinfo *const lck = env->me_lck_mmap.lck; + if (lck && (env->me_flags & MDBX_NOTLS) == 0 && + (mdbx_runtime_flags & MDBX_DBG_LEGACY_OVERLAP) == 0) { + const unsigned snap_nreaders = + atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); + for (unsigned i = 0; i < snap_nreaders; ++i) { + if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_Relaxed) == + env->me_pid && + unlikely(atomic_load64(&lck->mti_readers[i].mr_tid, mo_Relaxed) == + tid)) { + const txnid_t txnid = safe64_read(&lck->mti_readers[i].mr_txnid); + if (txnid >= MIN_TXNID && txnid <= MAX_TXNID) + return MDBX_TXN_OVERLAPPING; + } + } + } + + /* Not yet touching txn == env->me_txn0, it may be active */ + mdbx_jitter4testing(false); + rc = mdbx_txn_lock(env, F_ISSET(flags, MDBX_TXN_TRY)); + if (unlikely(rc)) + return rc; + if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) { + mdbx_txn_unlock(env); + return MDBX_PANIC; + } +#if defined(_WIN32) || defined(_WIN64) + if (unlikely(!env->me_map)) { + mdbx_txn_unlock(env); + return MDBX_EPERM; + } +#endif /* Windows */ + + mdbx_jitter4testing(false); + MDBX_meta *meta = mdbx_meta_head(env); + mdbx_jitter4testing(false); + txn->mt_canary = meta->mm_canary; + const txnid_t snap = mdbx_meta_txnid_stable(env, meta); + txn->mt_txnid = safe64_txnid_next(snap); + if (unlikely(txn->mt_txnid > MAX_TXNID)) 
{ + rc = MDBX_TXN_FULL; + mdbx_error("txnid overflow, raise %d", rc); + goto bailout; + } + + txn->mt_flags = flags; + txn->mt_child = NULL; + txn->tw.loose_pages = NULL; + txn->tw.loose_count = 0; +#if MDBX_ENABLE_REFUND + txn->tw.loose_refund_wl = 0; +#endif /* MDBX_ENABLE_REFUND */ + MDBX_PNL_SIZE(txn->tw.retired_pages) = 0; + txn->tw.spill_pages = NULL; + txn->tw.spill_least_removed = 0; + txn->tw.last_reclaimed = 0; + if (txn->tw.lifo_reclaimed) + MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) = 0; + env->me_txn = txn; + txn->mt_numdbs = env->me_numdbs; + memcpy(txn->mt_dbiseqs, env->me_dbiseqs, txn->mt_numdbs * sizeof(unsigned)); + /* Copy the DB info and flags */ + memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDBX_db)); + /* Moved to here to avoid a data race in read TXNs */ + txn->mt_geo = meta->mm_geo; + + rc = mdbx_dpl_alloc(txn); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + txn->tw.dirtyroom = txn->mt_env->me_options.dp_limit; + txn->tw.dirtylru = MDBX_DEBUG ? ~42u : 0; + } + + /* Setup db info */ + mdbx_compiler_barrier(); + for (unsigned i = CORE_DBS; i < txn->mt_numdbs; i++) { + const unsigned db_flags = env->me_dbflags[i]; + txn->mt_dbs[i].md_flags = db_flags & DB_PERSISTENT_FLAGS; + txn->mt_dbistate[i] = + (db_flags & DB_VALID) ? DBI_VALID | DBI_USRVALID | DBI_STALE : 0; + } + txn->mt_dbistate[MAIN_DBI] = DBI_VALID | DBI_USRVALID; + txn->mt_dbistate[FREE_DBI] = DBI_VALID; + txn->mt_front = + txn->mt_txnid + ((flags & (MDBX_WRITEMAP | MDBX_RDONLY)) == 0); + + if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) { + mdbx_warning("%s", "environment had fatal error, must shutdown!"); + rc = MDBX_PANIC; + } else { + const size_t size = + pgno2bytes(env, (txn->mt_flags & MDBX_TXN_RDONLY) ? txn->mt_next_pgno + : txn->mt_end_pgno); + if (unlikely(size > env->me_dxb_mmap.limit)) { + if (txn->mt_geo.upper > MAX_PAGENO || + bytes2pgno(env, pgno2bytes(env, txn->mt_geo.upper)) != + txn->mt_geo.upper) { + rc = MDBX_UNABLE_EXTEND_MAPSIZE; + goto bailout; + } + rc = mdbx_mapresize(env, txn->mt_next_pgno, txn->mt_end_pgno, + txn->mt_geo.upper, + (txn->mt_flags & MDBX_TXN_RDONLY) ? true : false); + if (rc != MDBX_SUCCESS) + goto bailout; + } + if (txn->mt_flags & MDBX_TXN_RDONLY) { +#if defined(_WIN32) || defined(_WIN64) + if (((size > env->me_dbgeo.lower && env->me_dbgeo.shrink) || + (mdbx_RunningUnderWine() && + /* under Wine acquisition of remap_guard is always required, + * since Wine don't support section extending, + * i.e. in both cases unmap+map are required. */ + size < env->me_dbgeo.upper && env->me_dbgeo.grow)) && + /* avoid recursive use SRW */ (txn->mt_flags & MDBX_NOTLS) == 0) { + txn->mt_flags |= MDBX_SHRINK_ALLOWED; + mdbx_srwlock_AcquireShared(&env->me_remap_guard); + } +#endif /* Windows */ + } else { + env->me_dxb_mmap.current = size; +#if defined(_WIN32) || defined(_WIN64) + env->me_dxb_mmap.filesize = + (env->me_dxb_mmap.filesize < size) ? 
size : env->me_dxb_mmap.filesize; +#endif /* Windows */ + } +#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) + mdbx_txn_valgrind(env, txn); +#endif + txn->mt_owner = tid; + return MDBX_SUCCESS; + } +bailout: + mdbx_tassert(txn, rc != MDBX_SUCCESS); + mdbx_txn_end(txn, MDBX_END_SLOT | MDBX_END_FAIL_BEGIN); + return rc; +} + +static __always_inline int check_txn(const MDBX_txn *txn, int bad_bits) { + if (unlikely(!txn)) + return MDBX_EINVAL; + + if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDBX_EBADSIGN; + + if (unlikely(txn->mt_flags & bad_bits)) + return MDBX_BAD_TXN; + +#if MDBX_TXN_CHECKOWNER + if ((txn->mt_flags & MDBX_NOTLS) == 0 && + unlikely(txn->mt_owner != mdbx_thread_self())) + return txn->mt_owner ? MDBX_THREAD_MISMATCH : MDBX_BAD_TXN; +#endif /* MDBX_TXN_CHECKOWNER */ + + if (unlikely(!txn->mt_env->me_map)) + return MDBX_EPERM; + + return MDBX_SUCCESS; +} + +static __always_inline int check_txn_rw(const MDBX_txn *txn, int bad_bits) { + int err = check_txn(txn, bad_bits); + if (unlikely(err)) + return err; + + if (unlikely(F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY))) + return MDBX_EACCESS; + + return MDBX_SUCCESS; +} + +int mdbx_txn_renew(MDBX_txn *txn) { + if (unlikely(!txn)) + return MDBX_EINVAL; + + if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDBX_EBADSIGN; + + if (unlikely((txn->mt_flags & MDBX_TXN_RDONLY) == 0)) + return MDBX_EINVAL; + + int rc; + if (unlikely(txn->mt_owner != 0 || !(txn->mt_flags & MDBX_TXN_FINISHED))) { + rc = mdbx_txn_reset(txn); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + } + + rc = mdbx_txn_renew0(txn, MDBX_TXN_RDONLY); + if (rc == MDBX_SUCCESS) { + txn->mt_owner = mdbx_thread_self(); + mdbx_debug("renew txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO + "/%" PRIaPGNO, + txn->mt_txnid, (txn->mt_flags & MDBX_TXN_RDONLY) ? 'r' : 'w', + (void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root, + txn->mt_dbs[FREE_DBI].md_root); + } + return rc; +} + +#ifndef LIBMDBX_NO_EXPORTS_LEGACY_API +int mdbx_txn_begin(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, + MDBX_txn **ret) { + return __inline_mdbx_txn_begin(env, parent, flags, ret); +} +#endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */ + +int mdbx_txn_set_userctx(MDBX_txn *txn, void *ctx) { + int rc = check_txn(txn, MDBX_TXN_BLOCKED - MDBX_TXN_HAS_CHILD); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + txn->mt_userctx = ctx; + return MDBX_SUCCESS; +} + +void *mdbx_txn_get_userctx(const MDBX_txn *txn) { + return check_txn(txn, MDBX_TXN_BLOCKED - MDBX_TXN_HAS_CHILD) + ? 
nullptr + : txn->mt_userctx; +} + +int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, + MDBX_txn **ret, void *context) { + MDBX_txn *txn; + unsigned size, tsize; + + if (unlikely(!ret)) + return MDBX_EINVAL; + *ret = NULL; + + if (unlikely((flags & ~MDBX_TXN_RW_BEGIN_FLAGS) && + (flags & ~MDBX_TXN_RO_BEGIN_FLAGS))) + return MDBX_EINVAL; + + int rc = check_env(env, true); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(env->me_flags & MDBX_RDONLY & + ~flags)) /* write txn in RDONLY env */ + return MDBX_EACCESS; + + flags |= env->me_flags & MDBX_WRITEMAP; + + if (parent) { + /* Nested transactions: Max 1 child, write txns only, no writemap */ + rc = check_txn_rw(parent, + MDBX_TXN_RDONLY | MDBX_WRITEMAP | MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (env->me_options.spill_parent4child_denominator) { + /* Spill dirty-pages of parent to provide dirtyroom for child txn */ + rc = mdbx_txn_spill(parent, nullptr, + parent->tw.dirtylist->length / + env->me_options.spill_parent4child_denominator); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + } + mdbx_tassert(parent, mdbx_audit_ex(parent, 0, false) == 0); + + flags |= parent->mt_flags & (MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_SPILLS); + /* Child txns save MDBX_pgstate and use own copy of cursors */ + size = env->me_maxdbs * (sizeof(MDBX_db) + sizeof(MDBX_cursor *) + 1); + size += tsize = sizeof(MDBX_txn); + } else if (flags & MDBX_TXN_RDONLY) { + if (env->me_txn0 && + unlikely(env->me_txn0->mt_owner == mdbx_thread_self()) && + (mdbx_runtime_flags & MDBX_DBG_LEGACY_OVERLAP) == 0) + return MDBX_TXN_OVERLAPPING; + size = env->me_maxdbs * (sizeof(MDBX_db) + 1); + size += tsize = sizeof(MDBX_txn); + } else { + /* Reuse preallocated write txn. However, do not touch it until + * mdbx_txn_renew0() succeeds, since it currently may be active. */ + txn = env->me_txn0; + goto renew; + } + if (unlikely((txn = mdbx_malloc(size)) == NULL)) { + mdbx_debug("calloc: %s", "failed"); + return MDBX_ENOMEM; + } + memset(txn, 0, tsize); + txn->mt_dbxs = env->me_dbxs; /* static */ + txn->mt_dbs = (MDBX_db *)((char *)txn + tsize); + txn->mt_dbistate = (uint8_t *)txn + size - env->me_maxdbs; + txn->mt_flags = flags; + txn->mt_env = env; + + if (parent) { + mdbx_tassert(parent, mdbx_dirtylist_check(parent)); + txn->tw.cursors = (MDBX_cursor **)(txn->mt_dbs + env->me_maxdbs); + txn->mt_dbiseqs = parent->mt_dbiseqs; + txn->mt_geo = parent->mt_geo; + rc = mdbx_dpl_alloc(txn); + if (likely(rc == MDBX_SUCCESS)) { + const unsigned len = + MDBX_PNL_SIZE(parent->tw.reclaimed_pglist) + parent->tw.loose_count; + txn->tw.reclaimed_pglist = + mdbx_pnl_alloc((len > MDBX_PNL_INITIAL) ? 
len : MDBX_PNL_INITIAL); + if (unlikely(!txn->tw.reclaimed_pglist)) + rc = MDBX_ENOMEM; + } + if (unlikely(rc != MDBX_SUCCESS)) { + nested_failed: + mdbx_pnl_free(txn->tw.reclaimed_pglist); + mdbx_dpl_free(txn); + mdbx_free(txn); + return rc; + } + + /* Move loose pages to reclaimed list */ + if (parent->tw.loose_count) { + do { + MDBX_page *lp = parent->tw.loose_pages; + const unsigned di = mdbx_dpl_exist(parent, lp->mp_pgno); + mdbx_tassert(parent, di && parent->tw.dirtylist->items[di].ptr == lp); + mdbx_tassert(parent, lp->mp_flags == P_LOOSE); + rc = + mdbx_pnl_insert_range(&parent->tw.reclaimed_pglist, lp->mp_pgno, 1); + if (unlikely(rc != MDBX_SUCCESS)) + goto nested_failed; + parent->tw.loose_pages = lp->mp_next; + /* Remove from dirty list */ + mdbx_page_wash(parent, di, lp, 1); + } while (parent->tw.loose_pages); + parent->tw.loose_count = 0; +#if MDBX_ENABLE_REFUND + parent->tw.loose_refund_wl = 0; +#endif /* MDBX_ENABLE_REFUND */ + mdbx_tassert(parent, mdbx_dirtylist_check(parent)); + } + txn->tw.dirtyroom = parent->tw.dirtyroom; + txn->tw.dirtylru = parent->tw.dirtylru; + + mdbx_dpl_sort(parent); + if (parent->tw.spill_pages) + mdbx_spill_purge(parent); + + mdbx_tassert(txn, MDBX_PNL_ALLOCLEN(txn->tw.reclaimed_pglist) >= + MDBX_PNL_SIZE(parent->tw.reclaimed_pglist)); + memcpy(txn->tw.reclaimed_pglist, parent->tw.reclaimed_pglist, + MDBX_PNL_SIZEOF(parent->tw.reclaimed_pglist)); + mdbx_assert(env, mdbx_pnl_check4assert( + txn->tw.reclaimed_pglist, + (txn->mt_next_pgno /* LY: intentional assignment here, + only for assertion */ + = parent->mt_next_pgno) - + MDBX_ENABLE_REFUND)); + + txn->tw.last_reclaimed = parent->tw.last_reclaimed; + if (parent->tw.lifo_reclaimed) { + txn->tw.lifo_reclaimed = parent->tw.lifo_reclaimed; + parent->tw.lifo_reclaimed = + (void *)(intptr_t)MDBX_PNL_SIZE(parent->tw.lifo_reclaimed); + } + + txn->tw.retired_pages = parent->tw.retired_pages; + parent->tw.retired_pages = + (void *)(intptr_t)MDBX_PNL_SIZE(parent->tw.retired_pages); + + txn->mt_txnid = parent->mt_txnid; + txn->mt_front = parent->mt_front + 1; +#if MDBX_ENABLE_REFUND + txn->tw.loose_refund_wl = 0; +#endif /* MDBX_ENABLE_REFUND */ + txn->mt_canary = parent->mt_canary; + parent->mt_flags |= MDBX_TXN_HAS_CHILD; + parent->mt_child = txn; + txn->mt_parent = parent; + txn->mt_numdbs = parent->mt_numdbs; + txn->mt_owner = parent->mt_owner; + memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDBX_db)); + /* Copy parent's mt_dbistate, but clear DB_NEW */ + for (unsigned i = 0; i < txn->mt_numdbs; i++) + txn->mt_dbistate[i] = + parent->mt_dbistate[i] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY); + mdbx_tassert(parent, + parent->tw.dirtyroom + parent->tw.dirtylist->length == + (parent->mt_parent ? parent->mt_parent->tw.dirtyroom + : parent->mt_env->me_options.dp_limit)); + mdbx_tassert(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == + (txn->mt_parent ? 
txn->mt_parent->tw.dirtyroom + : txn->mt_env->me_options.dp_limit)); + env->me_txn = txn; + rc = mdbx_cursor_shadow(parent, txn); + if (mdbx_audit_enabled() && mdbx_assert_enabled()) { + txn->mt_signature = MDBX_MT_SIGNATURE; + mdbx_tassert(txn, mdbx_audit_ex(txn, 0, false) == 0); + } + if (unlikely(rc != MDBX_SUCCESS)) + mdbx_txn_end(txn, MDBX_END_FAIL_BEGINCHILD); + } else { /* MDBX_TXN_RDONLY */ + txn->mt_dbiseqs = env->me_dbiseqs; + renew: + rc = mdbx_txn_renew0(txn, flags); + } + + if (unlikely(rc != MDBX_SUCCESS)) { + if (txn != env->me_txn0) + mdbx_free(txn); + } else { + if (flags & (MDBX_TXN_RDONLY_PREPARE - MDBX_TXN_RDONLY)) + mdbx_assert(env, txn->mt_flags == (MDBX_TXN_RDONLY | MDBX_TXN_FINISHED)); + else if (flags & MDBX_TXN_RDONLY) + mdbx_assert(env, (txn->mt_flags & + ~(MDBX_NOTLS | MDBX_TXN_RDONLY | MDBX_WRITEMAP | + /* Win32: SRWL flag */ MDBX_SHRINK_ALLOWED)) == 0); + else { + mdbx_assert(env, (txn->mt_flags & ~(MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED | + MDBX_NOMETASYNC | MDBX_SAFE_NOSYNC | + MDBX_TXN_SPILLS)) == 0); + assert(!txn->tw.spill_pages && !txn->tw.spill_least_removed); + } + txn->mt_signature = MDBX_MT_SIGNATURE; + txn->mt_userctx = context; + *ret = txn; + mdbx_debug("begin txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO + "/%" PRIaPGNO, + txn->mt_txnid, (flags & MDBX_TXN_RDONLY) ? 'r' : 'w', + (void *)txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root, + txn->mt_dbs[FREE_DBI].md_root); + } + + return rc; +} + +int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) { + int rc = check_txn(txn, MDBX_TXN_BLOCKED - MDBX_TXN_HAS_CHILD); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(!info)) + return MDBX_EINVAL; + + MDBX_env *const env = txn->mt_env; +#if MDBX_ENV_CHECKPID + if (unlikely(env->me_pid != mdbx_getpid())) { + env->me_flags |= MDBX_FATAL_ERROR; + return MDBX_PANIC; + } +#endif /* MDBX_ENV_CHECKPID */ + + info->txn_id = txn->mt_txnid; + info->txn_space_used = pgno2bytes(env, txn->mt_geo.next); + + if (txn->mt_flags & MDBX_TXN_RDONLY) { + const MDBX_meta *head_meta; + txnid_t head_txnid; + uint64_t head_retired; + do { + /* fetch info from volatile head */ + head_meta = mdbx_meta_head(env); + head_txnid = mdbx_meta_txnid_fluid(env, head_meta); + head_retired = unaligned_peek_u64(4, head_meta->mm_pages_retired); + info->txn_space_limit_soft = pgno2bytes(env, head_meta->mm_geo.now); + info->txn_space_limit_hard = pgno2bytes(env, head_meta->mm_geo.upper); + info->txn_space_leftover = + pgno2bytes(env, head_meta->mm_geo.now - head_meta->mm_geo.next); + mdbx_compiler_barrier(); + } while (unlikely(head_meta != mdbx_meta_head(env) || + head_txnid != mdbx_meta_txnid_fluid(env, head_meta))); + + info->txn_reader_lag = head_txnid - info->txn_id; + info->txn_space_dirty = info->txn_space_retired = 0; + uint64_t reader_snapshot_pages_retired; + if (txn->to.reader && + head_retired > + (reader_snapshot_pages_retired = atomic_load64( + &txn->to.reader->mr_snapshot_pages_retired, mo_Relaxed))) { + info->txn_space_dirty = info->txn_space_retired = pgno2bytes( + env, (pgno_t)(head_retired - reader_snapshot_pages_retired)); + + size_t retired_next_reader = 0; + MDBX_lockinfo *const lck = env->me_lck_mmap.lck; + if (scan_rlt && info->txn_reader_lag > 1 && lck) { + /* find next more recent reader */ + txnid_t next_reader = head_txnid; + const unsigned snap_nreaders = + atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); + for (unsigned i = 0; i < snap_nreaders; ++i) { + retry: + if (atomic_load32(&lck->mti_readers[i].mr_pid, 
mo_AcquireRelease)) { + mdbx_jitter4testing(true); + const txnid_t snap_txnid = + safe64_read(&lck->mti_readers[i].mr_txnid); + const uint64_t snap_retired = + atomic_load64(&lck->mti_readers[i].mr_snapshot_pages_retired, + mo_AcquireRelease); + if (unlikely(snap_retired != + atomic_load64( + &lck->mti_readers[i].mr_snapshot_pages_retired, + mo_Relaxed)) || + snap_txnid != safe64_read(&lck->mti_readers[i].mr_txnid)) + goto retry; + if (snap_txnid <= txn->mt_txnid) { + retired_next_reader = 0; + break; + } + if (snap_txnid < next_reader) { + next_reader = snap_txnid; + retired_next_reader = pgno2bytes( + env, (pgno_t)(snap_retired - + atomic_load64( + &txn->to.reader->mr_snapshot_pages_retired, + mo_Relaxed))); + } + } + } + } + info->txn_space_dirty = retired_next_reader; + } + } else { + info->txn_space_limit_soft = pgno2bytes(env, txn->mt_geo.now); + info->txn_space_limit_hard = pgno2bytes(env, txn->mt_geo.upper); + info->txn_space_retired = pgno2bytes( + env, txn->mt_child ? (unsigned)(uintptr_t)txn->tw.retired_pages + : MDBX_PNL_SIZE(txn->tw.retired_pages)); + info->txn_space_leftover = pgno2bytes(env, txn->tw.dirtyroom); + info->txn_space_dirty = + pgno2bytes(env, txn->mt_env->me_options.dp_limit - txn->tw.dirtyroom); + info->txn_reader_lag = INT64_MAX; + MDBX_lockinfo *const lck = env->me_lck_mmap.lck; + if (scan_rlt && lck) { + txnid_t oldest_snapshot = txn->mt_txnid; + const unsigned snap_nreaders = + atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); + if (snap_nreaders) { + oldest_snapshot = mdbx_find_oldest(txn); + if (oldest_snapshot == txn->mt_txnid - 1) { + /* check if there is at least one reader */ + bool exists = false; + for (unsigned i = 0; i < snap_nreaders; ++i) { + if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_Relaxed) && + txn->mt_txnid > safe64_read(&lck->mti_readers[i].mr_txnid)) { + exists = true; + break; + } + } + oldest_snapshot += !exists; + } + } + info->txn_reader_lag = txn->mt_txnid - oldest_snapshot; + } + } + + return MDBX_SUCCESS; +} + +MDBX_env *mdbx_txn_env(const MDBX_txn *txn) { + if (unlikely(!txn || txn->mt_signature != MDBX_MT_SIGNATURE || + txn->mt_env->me_signature.weak != MDBX_ME_SIGNATURE)) + return NULL; + return txn->mt_env; +} + +uint64_t mdbx_txn_id(const MDBX_txn *txn) { + if (unlikely(!txn || txn->mt_signature != MDBX_MT_SIGNATURE)) + return 0; + return txn->mt_txnid; +} + +int mdbx_txn_flags(const MDBX_txn *txn) { + if (unlikely(!txn || txn->mt_signature != MDBX_MT_SIGNATURE)) + return -1; + return txn->mt_flags; +} + +/* Check for misused dbi handles */ +#define TXN_DBI_CHANGED(txn, dbi) \ + ((txn)->mt_dbiseqs[dbi] != (txn)->mt_env->me_dbiseqs[dbi]) + +static void dbi_import_locked(MDBX_txn *txn) { + MDBX_env *const env = txn->mt_env; + const unsigned n = env->me_numdbs; + for (unsigned i = CORE_DBS; i < n; ++i) { + if (i >= txn->mt_numdbs) { + txn->mt_dbistate[i] = 0; + if (!(txn->mt_flags & MDBX_TXN_RDONLY)) + txn->tw.cursors[i] = NULL; + } + if ((env->me_dbflags[i] & DB_VALID) && + !(txn->mt_dbistate[i] & DBI_USRVALID)) { + txn->mt_dbiseqs[i] = env->me_dbiseqs[i]; + txn->mt_dbs[i].md_flags = env->me_dbflags[i] & DB_PERSISTENT_FLAGS; + txn->mt_dbistate[i] = DBI_VALID | DBI_USRVALID | DBI_STALE; + mdbx_tassert(txn, txn->mt_dbxs[i].md_cmp != NULL); + mdbx_tassert(txn, txn->mt_dbxs[i].md_name.iov_base != NULL); + } + } + txn->mt_numdbs = n; +} + +/* Import DBI which opened after txn started into context */ +static __cold bool dbi_import(MDBX_txn *txn, MDBX_dbi dbi) { + if (dbi < CORE_DBS || dbi >= txn->mt_env->me_numdbs) + 
return false; + + mdbx_ensure(txn->mt_env, mdbx_fastmutex_acquire(&txn->mt_env->me_dbi_lock) == + MDBX_SUCCESS); + dbi_import_locked(txn); + mdbx_ensure(txn->mt_env, mdbx_fastmutex_release(&txn->mt_env->me_dbi_lock) == + MDBX_SUCCESS); + return txn->mt_dbistate[dbi] & DBI_USRVALID; +} + +/* Export or close DBI handles opened in this txn. */ +static void dbi_update(MDBX_txn *txn, int keep) { + mdbx_tassert(txn, !txn->mt_parent && txn == txn->mt_env->me_txn0); + MDBX_dbi n = txn->mt_numdbs; + if (n) { + bool locked = false; + MDBX_env *const env = txn->mt_env; + + for (unsigned i = n; --i >= CORE_DBS;) { + if (likely((txn->mt_dbistate[i] & DBI_CREAT) == 0)) + continue; + if (!locked) { + mdbx_ensure(env, + mdbx_fastmutex_acquire(&env->me_dbi_lock) == MDBX_SUCCESS); + locked = true; + } + if (env->me_numdbs <= i || txn->mt_dbiseqs[i] != env->me_dbiseqs[i]) + continue /* dbi explicitly closed and/or then re-opened by other txn */; + if (keep) { + env->me_dbflags[i] = txn->mt_dbs[i].md_flags | DB_VALID; + } else { + char *ptr = env->me_dbxs[i].md_name.iov_base; + if (ptr) { + env->me_dbxs[i].md_name.iov_len = 0; + mdbx_memory_fence(mo_AcquireRelease, true); + mdbx_assert(env, env->me_dbflags[i] == 0); + env->me_dbiseqs[i]++; + env->me_dbxs[i].md_name.iov_base = NULL; + mdbx_free(ptr); + } + } + } + + n = env->me_numdbs; + if (n > CORE_DBS && unlikely(!(env->me_dbflags[n - 1] & DB_VALID))) { + if (!locked) { + mdbx_ensure(env, + mdbx_fastmutex_acquire(&env->me_dbi_lock) == MDBX_SUCCESS); + locked = true; + } + + n = env->me_numdbs; + while (n > CORE_DBS && !(env->me_dbflags[n - 1] & DB_VALID)) + --n; + env->me_numdbs = n; + } + + if (unlikely(locked)) + mdbx_ensure(env, + mdbx_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); + } +} + +/* Filter-out pgno list from transaction's dirty-page list */ +static void mdbx_dpl_sift(MDBX_txn *const txn, MDBX_PNL pl, + const bool spilled) { + if (MDBX_PNL_SIZE(pl) && txn->tw.dirtylist->length) { + mdbx_tassert(txn, mdbx_pnl_check4assert(pl, txn->mt_next_pgno << spilled)); + MDBX_dpl *dl = mdbx_dpl_sort(txn); + + /* Scanning in ascend order */ + const int step = MDBX_PNL_ASCENDING ? 1 : -1; + const int begin = MDBX_PNL_ASCENDING ? 1 : MDBX_PNL_SIZE(pl); + const int end = MDBX_PNL_ASCENDING ? MDBX_PNL_SIZE(pl) + 1 : 0; + mdbx_tassert(txn, pl[begin] <= pl[end - step]); + + unsigned r = mdbx_dpl_search(txn, pl[begin] >> spilled); + mdbx_tassert(txn, dl->sorted == dl->length); + for (int i = begin; r <= dl->length;) { /* scan loop */ + assert(i != end); + mdbx_tassert(txn, !spilled || (pl[i] & 1) == 0); + pgno_t pl_pgno = pl[i] >> spilled; + pgno_t dp_pgno = dl->items[r].pgno; + if (likely(dp_pgno != pl_pgno)) { + const bool cmp = dp_pgno < pl_pgno; + r += cmp; + i += cmp ? 
0 : step; + if (likely(i != end)) + continue; + return; + } + + /* update loop */ + unsigned w = r; + remove_dl: + if ((txn->mt_env->me_flags & MDBX_WRITEMAP) == 0) { + MDBX_page *dp = dl->items[r].ptr; + mdbx_dpage_free(txn->mt_env, dp, dpl_npages(dl, r)); + } + ++r; + next_i: + i += step; + if (unlikely(i == end)) { + while (r <= dl->length) + dl->items[w++] = dl->items[r++]; + } else { + while (r <= dl->length) { + assert(i != end); + mdbx_tassert(txn, !spilled || (pl[i] & 1) == 0); + pl_pgno = pl[i] >> spilled; + dp_pgno = dl->items[r].pgno; + if (dp_pgno < pl_pgno) + dl->items[w++] = dl->items[r++]; + else if (dp_pgno > pl_pgno) + goto next_i; + else + goto remove_dl; + } + } + dl->sorted = dpl_setlen(dl, w - 1); + txn->tw.dirtyroom += r - w; + mdbx_tassert(txn, + txn->tw.dirtyroom + txn->tw.dirtylist->length == + (txn->mt_parent ? txn->mt_parent->tw.dirtyroom + : txn->mt_env->me_options.dp_limit)); + return; + } + } +} + +/* End a transaction, except successful commit of a nested transaction. + * May be called twice for readonly txns: First reset it, then abort. + * [in] txn the transaction handle to end + * [in] mode why and how to end the transaction */ +static int mdbx_txn_end(MDBX_txn *txn, const unsigned mode) { + MDBX_env *env = txn->mt_env; + static const char *const names[] = MDBX_END_NAMES; + +#if MDBX_ENV_CHECKPID + if (unlikely(txn->mt_env->me_pid != mdbx_getpid())) { + env->me_flags |= MDBX_FATAL_ERROR; + return MDBX_PANIC; + } +#endif /* MDBX_ENV_CHECKPID */ + + mdbx_debug("%s txn %" PRIaTXN "%c %p on mdbenv %p, root page %" PRIaPGNO + "/%" PRIaPGNO, + names[mode & MDBX_END_OPMASK], txn->mt_txnid, + (txn->mt_flags & MDBX_TXN_RDONLY) ? 'r' : 'w', (void *)txn, + (void *)env, txn->mt_dbs[MAIN_DBI].md_root, + txn->mt_dbs[FREE_DBI].md_root); + + mdbx_ensure(env, txn->mt_txnid >= + /* paranoia is appropriate here */ env->me_lck + ->mti_oldest_reader.weak); + + int rc = MDBX_SUCCESS; + if (F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY)) { + if (txn->to.reader) { + MDBX_reader *slot = txn->to.reader; + mdbx_assert(env, slot->mr_pid.weak == env->me_pid); + if (likely(!F_ISSET(txn->mt_flags, MDBX_TXN_FINISHED))) { + mdbx_assert(env, txn->mt_txnid == slot->mr_txnid.weak && + slot->mr_txnid.weak >= + env->me_lck->mti_oldest_reader.weak); +#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) + mdbx_txn_valgrind(env, nullptr); +#endif + atomic_store32(&slot->mr_snapshot_pages_used, 0, mo_Relaxed); + safe64_reset(&slot->mr_txnid, false); + atomic_store32(&env->me_lck->mti_readers_refresh_flag, true, + mo_Relaxed); + } else { + mdbx_assert(env, slot->mr_pid.weak == env->me_pid); + mdbx_assert(env, slot->mr_txnid.weak >= SAFE64_INVALID_THRESHOLD); + } + if (mode & MDBX_END_SLOT) { + if ((env->me_flags & MDBX_ENV_TXKEY) == 0) + atomic_store32(&slot->mr_pid, 0, mo_Relaxed); + txn->to.reader = NULL; + } + } +#if defined(_WIN32) || defined(_WIN64) + if (txn->mt_flags & MDBX_SHRINK_ALLOWED) + mdbx_srwlock_ReleaseShared(&env->me_remap_guard); +#endif + txn->mt_numdbs = 0; /* prevent further DBI activity */ + txn->mt_flags = MDBX_TXN_RDONLY | MDBX_TXN_FINISHED; + txn->mt_owner = 0; + } else if (!F_ISSET(txn->mt_flags, MDBX_TXN_FINISHED)) { +#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) + if (txn == env->me_txn0) + mdbx_txn_valgrind(env, nullptr); +#endif + if (!(mode & MDBX_END_EOTDONE)) /* !(already closed cursors) */ + mdbx_cursors_eot(txn, false); + + txn->mt_flags = MDBX_TXN_FINISHED; + txn->mt_owner = 0; + env->me_txn = txn->mt_parent; + 
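/* Editorial note (not upstream libmdbx text): the mode argument combines one op-code from MDBX_END_OPMASK with orthogonal flag bits; as the callers below show, mdbx_txn_reset() passes MDBX_END_RESET | MDBX_END_UPDATE to keep DBI handles, while mdbx_txn_abort() adds MDBX_END_SLOT | MDBX_END_FREE to also release the reader slot and free the MDBX_txn object. */ +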
mdbx_pnl_free(txn->tw.spill_pages); + txn->tw.spill_pages = nullptr; + if (txn == env->me_txn0) { + mdbx_assert(env, txn->mt_parent == NULL); + /* Export or close DBI handles created in this txn */ + dbi_update(txn, mode & MDBX_END_UPDATE); + mdbx_pnl_shrink(&txn->tw.retired_pages); + mdbx_pnl_shrink(&txn->tw.reclaimed_pglist); + if (!(env->me_flags & MDBX_WRITEMAP)) + mdbx_dlist_free(txn); + /* The writer mutex was locked in mdbx_txn_begin. */ + mdbx_txn_unlock(env); + } else { + mdbx_assert(env, txn->mt_parent != NULL); + MDBX_txn *const parent = txn->mt_parent; + mdbx_assert(env, parent->mt_signature == MDBX_MT_SIGNATURE); + mdbx_assert(env, parent->mt_child == txn && + (parent->mt_flags & MDBX_TXN_HAS_CHILD) != 0); + mdbx_assert( + env, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + + if (txn->tw.lifo_reclaimed) { + mdbx_assert(env, MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) >= + (unsigned)(uintptr_t)parent->tw.lifo_reclaimed); + MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) = + (unsigned)(uintptr_t)parent->tw.lifo_reclaimed; + parent->tw.lifo_reclaimed = txn->tw.lifo_reclaimed; + } + + if (txn->tw.retired_pages) { + mdbx_assert(env, MDBX_PNL_SIZE(txn->tw.retired_pages) >= + (unsigned)(uintptr_t)parent->tw.retired_pages); + MDBX_PNL_SIZE(txn->tw.retired_pages) = + (unsigned)(uintptr_t)parent->tw.retired_pages; + parent->tw.retired_pages = txn->tw.retired_pages; + } + + parent->mt_child = nullptr; + parent->mt_flags &= ~MDBX_TXN_HAS_CHILD; + parent->tw.dirtylru = txn->tw.dirtylru; + mdbx_tassert(parent, mdbx_dirtylist_check(parent)); + mdbx_tassert(parent, mdbx_audit_ex(parent, 0, false) == 0); + if (!(env->me_flags & MDBX_WRITEMAP)) + mdbx_dlist_free(txn); + mdbx_dpl_free(txn); + mdbx_pnl_free(txn->tw.reclaimed_pglist); + + if (parent->mt_geo.upper != txn->mt_geo.upper || + parent->mt_geo.now != txn->mt_geo.now) { + /* undo resize performed by child txn */ + rc = mdbx_mapresize_implicit(env, parent->mt_next_pgno, + parent->mt_geo.now, parent->mt_geo.upper); + if (rc == MDBX_RESULT_TRUE) { + /* unable to undo the resize (which is normal on Windows), + * therefore promote size changes from child to the parent txn */ + mdbx_warning("unable to undo resize performed by child txn, promote to " + "the parent (%u->%u, %u->%u)", + txn->mt_geo.now, parent->mt_geo.now, txn->mt_geo.upper, + parent->mt_geo.upper); + parent->mt_geo.now = txn->mt_geo.now; + parent->mt_geo.upper = txn->mt_geo.upper; + rc = MDBX_SUCCESS; + } else if (unlikely(rc != MDBX_SUCCESS)) { + mdbx_error("error %d while undoing resize performed by child txn, fail " + "the parent", + rc); + parent->mt_flags |= MDBX_TXN_ERROR; + if (!env->me_dxb_mmap.address) + env->me_flags |= MDBX_FATAL_ERROR; + } + } + } + } + + mdbx_assert(env, txn == env->me_txn0 || txn->mt_owner == 0); + if ((mode & MDBX_END_FREE) != 0 && txn != env->me_txn0) { + txn->mt_signature = 0; + mdbx_free(txn); + } + + return rc; +} + +int mdbx_txn_reset(MDBX_txn *txn) { + int rc = check_txn(txn, 0); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + /* This call is only valid for read-only txns */ + if (unlikely((txn->mt_flags & MDBX_TXN_RDONLY) == 0)) + return MDBX_EINVAL; + + /* LY: don't close DBI-handles */ + rc = mdbx_txn_end(txn, MDBX_END_RESET | MDBX_END_UPDATE); + if (rc == MDBX_SUCCESS) { + mdbx_tassert(txn, txn->mt_signature == MDBX_MT_SIGNATURE); + mdbx_tassert(txn, txn->mt_owner == 0); + } + return rc; +} + +int mdbx_txn_break(MDBX_txn *txn) { + do { + int rc = check_txn(txn, 0); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + 
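/* Editorial note (not upstream libmdbx text): mdbx_txn_break() only poisons + * a transaction - it walks from the given write-txn down through its nested + * children setting MDBX_TXN_ERROR, so a later commit bails out (see the + * MDBX_TXN_ERROR check in mdbx_txn_commit_ex below); the handle must still + * be released by the caller. A hedged usage sketch: + * + * mdbx_txn_break(txn); // commit of txn is now guaranteed to refuse + * int err = mdbx_txn_abort(txn); // still required to free the handle + */ +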
txn->mt_flags |= MDBX_TXN_ERROR; + if (txn->mt_flags & MDBX_TXN_RDONLY) + break; + txn = txn->mt_child; + } while (txn); + return MDBX_SUCCESS; +} + +int mdbx_txn_abort(MDBX_txn *txn) { + int rc = check_txn(txn, 0); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY)) + /* LY: don't close DBI-handles */ + return mdbx_txn_end(txn, MDBX_END_ABORT | MDBX_END_UPDATE | MDBX_END_SLOT | + MDBX_END_FREE); + + if (txn->mt_child) + mdbx_txn_abort(txn->mt_child); + + mdbx_tassert(txn, mdbx_dirtylist_check(txn)); + return mdbx_txn_end(txn, MDBX_END_ABORT | MDBX_END_SLOT | MDBX_END_FREE); +} + +/* Count all the pages in each DB and in the GC and make sure + * it matches the actual number of pages being used. */ +static __cold int mdbx_audit_ex(MDBX_txn *txn, unsigned retired_stored, + bool dont_filter_gc) { + pgno_t pending = 0; + if ((txn->mt_flags & MDBX_TXN_RDONLY) == 0) { + pending = txn->tw.loose_count + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) + + (MDBX_PNL_SIZE(txn->tw.retired_pages) - retired_stored); + } + + MDBX_cursor_couple cx; + int rc = mdbx_cursor_init(&cx.outer, txn, FREE_DBI); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + pgno_t gc = 0; + MDBX_val key, data; + while ((rc = mdbx_cursor_get(&cx.outer, &key, &data, MDBX_NEXT)) == 0) { + if (!dont_filter_gc) { + if (unlikely(key.iov_len != sizeof(txnid_t))) + return MDBX_CORRUPTED; + txnid_t id = unaligned_peek_u64(4, key.iov_base); + if (txn->tw.lifo_reclaimed) { + for (unsigned i = 1; i <= MDBX_PNL_SIZE(txn->tw.lifo_reclaimed); ++i) + if (id == txn->tw.lifo_reclaimed[i]) + goto skip; + } else if (id <= txn->tw.last_reclaimed) + goto skip; + } + + gc += *(pgno_t *)data.iov_base; + skip:; + } + mdbx_tassert(txn, rc == MDBX_NOTFOUND); + + for (MDBX_dbi i = FREE_DBI; i < txn->mt_numdbs; i++) + txn->mt_dbistate[i] &= ~DBI_AUDITED; + + pgno_t used = NUM_METAS; + for (MDBX_dbi i = FREE_DBI; i <= MAIN_DBI; i++) { + if (!(txn->mt_dbistate[i] & DBI_VALID)) + continue; + rc = mdbx_cursor_init(&cx.outer, txn, i); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + txn->mt_dbistate[i] |= DBI_AUDITED; + if (txn->mt_dbs[i].md_root == P_INVALID) + continue; + used += txn->mt_dbs[i].md_branch_pages + txn->mt_dbs[i].md_leaf_pages + + txn->mt_dbs[i].md_overflow_pages; + + if (i != MAIN_DBI) + continue; + rc = mdbx_page_search(&cx.outer, NULL, MDBX_PS_FIRST); + while (rc == MDBX_SUCCESS) { + MDBX_page *mp = cx.outer.mc_pg[cx.outer.mc_top]; + for (unsigned j = 0; j < page_numkeys(mp); j++) { + MDBX_node *node = page_node(mp, j); + if (node_flags(node) == F_SUBDATA) { + if (unlikely(node_ds(node) != sizeof(MDBX_db))) + return MDBX_CORRUPTED; + MDBX_db db_copy, *db; + memcpy(db = &db_copy, node_data(node), sizeof(db_copy)); + if ((txn->mt_flags & MDBX_TXN_RDONLY) == 0) { + for (MDBX_dbi k = txn->mt_numdbs; --k > MAIN_DBI;) { + if ((txn->mt_dbistate[k] & DBI_VALID) && + /* txn->mt_dbxs[k].md_name.iov_len > 0 && */ + node_ks(node) == txn->mt_dbxs[k].md_name.iov_len && + memcmp(node_key(node), txn->mt_dbxs[k].md_name.iov_base, + node_ks(node)) == 0) { + txn->mt_dbistate[k] |= DBI_AUDITED; + if (!(txn->mt_dbistate[k] & MDBX_DBI_STALE)) + db = txn->mt_dbs + k; + break; + } + } + } + used += + db->md_branch_pages + db->md_leaf_pages + db->md_overflow_pages; + } + } + rc = mdbx_cursor_sibling(&cx.outer, SIBLING_RIGHT); + } + mdbx_tassert(txn, rc == MDBX_NOTFOUND); + } + + for (MDBX_dbi i = FREE_DBI; i < txn->mt_numdbs; i++) { + if ((txn->mt_dbistate[i] & (DBI_VALID | DBI_AUDITED | DBI_STALE)) != + DBI_VALID) + 
continue; + for (MDBX_txn *t = txn; t; t = t->mt_parent) + if (F_ISSET(t->mt_dbistate[i], DBI_DIRTY | DBI_CREAT)) { + used += t->mt_dbs[i].md_branch_pages + t->mt_dbs[i].md_leaf_pages + + t->mt_dbs[i].md_overflow_pages; + txn->mt_dbistate[i] |= DBI_AUDITED; + break; + } + if (!(txn->mt_dbistate[i] & DBI_AUDITED)) { + mdbx_warning("audit %s@%" PRIaTXN + ": unable to account dbi %d / \"%*s\", state 0x%02x", + txn->mt_parent ? "nested-" : "", txn->mt_txnid, i, + (int)txn->mt_dbxs[i].md_name.iov_len, + (const char *)txn->mt_dbxs[i].md_name.iov_base, + txn->mt_dbistate[i]); + } + } + + if (pending + gc + used == txn->mt_next_pgno) + return MDBX_SUCCESS; + + if ((txn->mt_flags & MDBX_TXN_RDONLY) == 0) + mdbx_error("audit @%" PRIaTXN ": %u(pending) = %u(loose) + " + "%u(reclaimed) + %u(retired-pending) - %u(retired-stored)", + txn->mt_txnid, pending, txn->tw.loose_count, + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist), + txn->tw.retired_pages ? MDBX_PNL_SIZE(txn->tw.retired_pages) : 0, + retired_stored); + mdbx_error("audit @%" PRIaTXN ": %" PRIaPGNO "(pending) + %" PRIaPGNO + "(gc) + %" PRIaPGNO "(count) = %" PRIaPGNO "(total) <> %" PRIaPGNO + "(allocated)", + txn->mt_txnid, pending, gc, used, pending + gc + used, + txn->mt_next_pgno); + return MDBX_PROBLEM; +} + +static __always_inline unsigned backlog_size(MDBX_txn *txn) { + return MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) + txn->tw.loose_count; +} + +/* LY: Prepare a backlog of pages to modify GC itself, + * while reclaiming is prohibited. It should be enough to prevent search + * in mdbx_page_alloc() during a deletion, when the GC tree is unbalanced. */ +static int mdbx_prep_backlog(MDBX_txn *txn, MDBX_cursor *gc_cursor, + const size_t pnl_bytes) { + const unsigned linear4list = number_of_ovpages(txn->mt_env, pnl_bytes); + const unsigned backlog4cow = txn->mt_dbs[FREE_DBI].md_depth; + const unsigned backlog4rebalance = backlog4cow + 1; + + if (likely(linear4list == 1 && + backlog_size(txn) > (pnl_bytes + ? backlog4rebalance + : (backlog4cow + backlog4rebalance)))) + return MDBX_SUCCESS; + + mdbx_trace(">> pnl_bytes %zu, backlog %u, 4list %u, 4cow %u, 4rebalance %u", + pnl_bytes, backlog_size(txn), linear4list, backlog4cow, + backlog4rebalance); + + MDBX_val fake_key, fake_val; + fake_key.iov_base = fake_val.iov_base = nullptr; + fake_key.iov_len = sizeof(txnid_t); + fake_val.iov_len = pnl_bytes; + int err = mdbx_cursor_spill(gc_cursor, &fake_key, &fake_val); + if (unlikely(err != MDBX_SUCCESS)) + return err; + + gc_cursor->mc_flags &= ~C_RECLAIMING; + err = mdbx_cursor_touch(gc_cursor); + mdbx_trace("== after-touch, backlog %u, err %d", backlog_size(txn), err); + + if (linear4list > 1 && err == MDBX_SUCCESS) { + err = mdbx_page_alloc(gc_cursor, linear4list, + MDBX_ALLOC_GC | MDBX_ALLOC_CACHE | MDBX_ALLOC_SLOT) + .err; + mdbx_trace("== after-4linear, backlog %u, err %d", backlog_size(txn), err); + } + + while (backlog_size(txn) < backlog4cow + linear4list && err == MDBX_SUCCESS) + err = mdbx_page_alloc(gc_cursor, 1, MDBX_ALLOC_GC | MDBX_ALLOC_SLOT).err; + + gc_cursor->mc_flags |= C_RECLAIMING; + mdbx_trace("<< backlog %u, err %d", backlog_size(txn), err); + return (err != MDBX_NOTFOUND) ? 
err : MDBX_SUCCESS; +} + +static __inline void clean_reserved_gc_pnl(MDBX_env *env, MDBX_val pnl) { + /* PNL is initially empty, zero out at least the length */ + memset(pnl.iov_base, 0, sizeof(pgno_t)); + if ((env->me_flags & (MDBX_WRITEMAP | MDBX_NOMEMINIT)) == 0) + /* zero out to avoid leaking values from uninitialized malloc'ed memory + * to the file in non-writemap mode if the length of the saved page-list + * was changed during space reservation. */ + memset(pnl.iov_base, 0, pnl.iov_len); +} + +/* Cleanup reclaimed GC records, then save the retired-list as of this + * transaction to GC (aka freeDB). This recursively changes the reclaimed-list, + * loose-list and retired-list. Keep trying until it stabilizes. */ +static int mdbx_update_gc(MDBX_txn *txn) { + /* txn->tw.reclaimed_pglist[] can grow and shrink during this call. + * txn->tw.last_reclaimed and txn->tw.retired_pages[] can only grow. + * Page numbers cannot disappear from txn->tw.retired_pages[]. */ + MDBX_env *const env = txn->mt_env; + const bool lifo = (env->me_flags & MDBX_LIFORECLAIM) != 0; + const char *dbg_prefix_mode = lifo ? " lifo" : " fifo"; + (void)dbg_prefix_mode; + mdbx_trace("\n>>> @%" PRIaTXN, txn->mt_txnid); + + unsigned retired_stored = 0, loop = 0; + MDBX_cursor_couple couple; + int rc = mdbx_cursor_init(&couple.outer, txn, FREE_DBI); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout_notracking; + + couple.outer.mc_flags |= C_RECLAIMING; + couple.outer.mc_next = txn->tw.cursors[FREE_DBI]; + txn->tw.cursors[FREE_DBI] = &couple.outer; + +retry: + ++loop; +retry_noaccount: + mdbx_trace("%s", " >> restart"); + mdbx_tassert(txn, + mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + mdbx_tassert(txn, mdbx_dirtylist_check(txn)); + if (unlikely(/* paranoia */ loop > ((MDBX_DEBUG > 0) ? 9 : 99))) { + mdbx_error("too many loops %u, bailout", loop); + rc = MDBX_PROBLEM; + goto bailout; + } + + rc = mdbx_prep_backlog(txn, &couple.outer, + MDBX_PNL_SIZEOF(txn->tw.retired_pages)); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + + unsigned settled = 0, cleaned_gc_slot = 0, reused_gc_slot = 0, + filled_gc_slot = ~0u; + txnid_t cleaned_gc_id = 0, gc_rid = txn->tw.last_reclaimed; + while (true) { + /* Come back here after each Put() in case retired-list changed */ + MDBX_val key, data; + mdbx_trace("%s", " >> continue"); + + mdbx_tassert(txn, + mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + if (lifo) { + if (cleaned_gc_slot < (txn->tw.lifo_reclaimed + ? MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) + : 0)) { + settled = 0; + cleaned_gc_slot = 0; + reused_gc_slot = 0; + filled_gc_slot = ~0u; + /* LY: cleanup reclaimed records. 
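+ * Editorial note (not upstream text): in LIFO mode every txn-id previously + * pulled into tw.lifo_reclaimed is looked up via MDBX_SET and its GC record + * deleted; mdbx_prep_backlog() runs before each deletion so that deleting + * from the GC tree itself never has to search GC for pages while + * C_RECLAIMING is set. 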
*/ + do { + cleaned_gc_id = txn->tw.lifo_reclaimed[++cleaned_gc_slot]; + mdbx_tassert(txn, + cleaned_gc_slot > 0 && + cleaned_gc_id < env->me_lck->mti_oldest_reader.weak); + key.iov_base = &cleaned_gc_id; + key.iov_len = sizeof(cleaned_gc_id); + rc = mdbx_cursor_get(&couple.outer, &key, NULL, MDBX_SET); + if (rc == MDBX_NOTFOUND) + continue; + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + rc = mdbx_prep_backlog(txn, &couple.outer, 0); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + mdbx_tassert(txn, + cleaned_gc_id < env->me_lck->mti_oldest_reader.weak); + mdbx_trace("%s.cleanup-reclaimed-id [%u]%" PRIaTXN, dbg_prefix_mode, + cleaned_gc_slot, cleaned_gc_id); + mdbx_tassert(txn, *txn->tw.cursors == &couple.outer); + rc = mdbx_cursor_del(&couple.outer, 0); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } while (cleaned_gc_slot < MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); + mdbx_txl_sort(txn->tw.lifo_reclaimed); + } + } else { + /* If using records from GC which we have not yet deleted, + * now delete them and any we reserved for tw.reclaimed_pglist. */ + while (cleaned_gc_id <= txn->tw.last_reclaimed) { + gc_rid = cleaned_gc_id; + settled = 0; + rc = mdbx_cursor_first(&couple.outer, &key, NULL); + if (unlikely(rc != MDBX_SUCCESS)) { + if (rc == MDBX_NOTFOUND) + break; + goto bailout; + } + if (!MDBX_DISABLE_PAGECHECKS && + unlikely(key.iov_len != sizeof(txnid_t))) { + rc = MDBX_CORRUPTED; + goto bailout; + } + cleaned_gc_id = unaligned_peek_u64(4, key.iov_base); + if (!MDBX_DISABLE_PAGECHECKS && + unlikely(cleaned_gc_id < MIN_TXNID || cleaned_gc_id > MAX_TXNID)) { + rc = MDBX_CORRUPTED; + goto bailout; + } + if (cleaned_gc_id > txn->tw.last_reclaimed) + break; + if (cleaned_gc_id < txn->tw.last_reclaimed) { + rc = mdbx_prep_backlog(txn, &couple.outer, 0); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } + mdbx_tassert(txn, cleaned_gc_id <= txn->tw.last_reclaimed); + mdbx_tassert(txn, cleaned_gc_id < env->me_lck->mti_oldest_reader.weak); + mdbx_trace("%s.cleanup-reclaimed-id %" PRIaTXN, dbg_prefix_mode, + cleaned_gc_id); + mdbx_tassert(txn, *txn->tw.cursors == &couple.outer); + rc = mdbx_cursor_del(&couple.outer, 0); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } + } + + mdbx_tassert(txn, + mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + mdbx_tassert(txn, mdbx_dirtylist_check(txn)); + if (mdbx_audit_enabled()) { + rc = mdbx_audit_ex(txn, retired_stored, false); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } + + /* return suitable into unallocated space */ + if (mdbx_refund(txn)) { + mdbx_tassert( + txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + if (mdbx_audit_enabled()) { + rc = mdbx_audit_ex(txn, retired_stored, false); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } + } + + /* handle loose pages - put ones into the reclaimed- or retired-list */ + if (txn->tw.loose_pages) { + /* Return loose page numbers to tw.reclaimed_pglist, + * though usually none are left at this point. + * The pages themselves remain in dirtylist. */ + if (unlikely(!txn->tw.lifo_reclaimed && txn->tw.last_reclaimed < 1)) { + if (txn->tw.loose_count > 0) { + /* Put loose page numbers in tw.retired_pages, + * since unable to return them to tw.reclaimed_pglist. 
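+ * (Editorial note, not upstream text: this fallback fires only when nothing + * was reclaimed from GC in this txn - no tw.lifo_reclaimed list and + * tw.last_reclaimed < 1 - so the loose page numbers are parked in + * tw.retired_pages and become reusable only through GC in a later txn.) 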
*/ + if (unlikely((rc = mdbx_pnl_need(&txn->tw.retired_pages, + txn->tw.loose_count)) != 0)) + goto bailout; + for (MDBX_page *mp = txn->tw.loose_pages; mp; mp = mp->mp_next) + mdbx_pnl_xappend(txn->tw.retired_pages, mp->mp_pgno); + mdbx_trace("%s: append %u loose-pages to retired-pages", + dbg_prefix_mode, txn->tw.loose_count); + } + } else { + /* Room for loose pages + temp PNL with same */ + rc = mdbx_pnl_need(&txn->tw.reclaimed_pglist, + 2 * txn->tw.loose_count + 2); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + MDBX_PNL loose = txn->tw.reclaimed_pglist + + MDBX_PNL_ALLOCLEN(txn->tw.reclaimed_pglist) - + txn->tw.loose_count - 1; + unsigned count = 0; + for (MDBX_page *mp = txn->tw.loose_pages; mp; mp = mp->mp_next) { + mdbx_tassert(txn, mp->mp_flags == P_LOOSE); + loose[++count] = mp->mp_pgno; + } + mdbx_tassert(txn, count == txn->tw.loose_count); + MDBX_PNL_SIZE(loose) = count; + mdbx_pnl_sort(loose); + mdbx_pnl_xmerge(txn->tw.reclaimed_pglist, loose); + mdbx_trace("%s: append %u loose-pages to reclaimed-pages", + dbg_prefix_mode, txn->tw.loose_count); + } + + /* filter-out list of dirty-pages from loose-pages */ + MDBX_dpl *const dl = txn->tw.dirtylist; + unsigned w = 0; + for (unsigned r = w; ++r <= dl->length;) { + MDBX_page *dp = dl->items[r].ptr; + mdbx_tassert(txn, dp->mp_flags == P_LOOSE || IS_MODIFIABLE(txn, dp)); + mdbx_tassert(txn, dpl_endpgno(dl, r) <= txn->mt_next_pgno); + if ((dp->mp_flags & P_LOOSE) == 0) { + if (++w != r) + dl->items[w] = dl->items[r]; + } else { + mdbx_tassert(txn, dp->mp_flags == P_LOOSE); + if ((env->me_flags & MDBX_WRITEMAP) == 0) + mdbx_dpage_free(env, dp, 1); + } + } + mdbx_trace("%s: filtered-out loose-pages from %u -> %u dirty-pages", + dbg_prefix_mode, dl->length, w); + mdbx_tassert(txn, txn->tw.loose_count == dl->length - w); + dpl_setlen(dl, w); + dl->sorted = 0; + txn->tw.dirtyroom += txn->tw.loose_count; + mdbx_tassert(txn, + txn->tw.dirtyroom + txn->tw.dirtylist->length == + (txn->mt_parent ? 
txn->mt_parent->tw.dirtyroom + : txn->mt_env->me_options.dp_limit)); + txn->tw.loose_pages = NULL; + txn->tw.loose_count = 0; +#if MDBX_ENABLE_REFUND + txn->tw.loose_refund_wl = 0; +#endif /* MDBX_ENABLE_REFUND */ + } + + const unsigned amount = (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist); + /* handle retired-list - store them into a single gc-record */ + if (retired_stored < MDBX_PNL_SIZE(txn->tw.retired_pages)) { + if (unlikely(!retired_stored)) { + /* Make sure last page of GC is touched and on retired-list */ + couple.outer.mc_flags &= ~C_RECLAIMING; + rc = mdbx_page_search(&couple.outer, NULL, + MDBX_PS_LAST | MDBX_PS_MODIFY); + couple.outer.mc_flags |= C_RECLAIMING; + if (unlikely(rc != MDBX_SUCCESS) && rc != MDBX_NOTFOUND) + goto bailout; + } + /* Write to last page of GC */ + key.iov_len = sizeof(txn->mt_txnid); + key.iov_base = &txn->mt_txnid; + do { + data.iov_len = MDBX_PNL_SIZEOF(txn->tw.retired_pages); + mdbx_prep_backlog(txn, &couple.outer, data.iov_len); + rc = mdbx_cursor_put(&couple.outer, &key, &data, MDBX_RESERVE); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + /* Retry if tw.retired_pages[] grew during the Put() */ + } while (data.iov_len < MDBX_PNL_SIZEOF(txn->tw.retired_pages)); + + retired_stored = (unsigned)MDBX_PNL_SIZE(txn->tw.retired_pages); + mdbx_pnl_sort(txn->tw.retired_pages); + mdbx_assert(env, data.iov_len == MDBX_PNL_SIZEOF(txn->tw.retired_pages)); + memcpy(data.iov_base, txn->tw.retired_pages, data.iov_len); + + mdbx_trace("%s.put-retired #%u @ %" PRIaTXN, dbg_prefix_mode, + retired_stored, txn->mt_txnid); + + if (mdbx_log_enabled(MDBX_LOG_EXTRA)) { + unsigned i = retired_stored; + mdbx_debug_extra("PNL write txn %" PRIaTXN " root %" PRIaPGNO + " num %u, PNL", + txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i); + for (; i; i--) + mdbx_debug_extra_print(" %" PRIaPGNO, txn->tw.retired_pages[i]); + mdbx_debug_extra_print("%s\n", "."); + } + if (unlikely(amount != MDBX_PNL_SIZE(txn->tw.reclaimed_pglist))) { + mdbx_trace("%s.reclaimed-list changed %u -> %u, retry", dbg_prefix_mode, + amount, (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)); + goto retry_noaccount /* rare case, but avoids GC fragmentation and one + cycle. */ + ; + } + continue; + } + + /* handle reclaimed and lost pages - merge and store both into gc */ + mdbx_tassert(txn, + mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + mdbx_tassert(txn, txn->tw.loose_count == 0); + + mdbx_trace("%s", " >> reserving"); + if (mdbx_audit_enabled()) { + rc = mdbx_audit_ex(txn, retired_stored, false); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } + const unsigned left = amount - settled; + mdbx_trace("%s: amount %u, settled %d, left %d, lifo-reclaimed-slots %u, " + "reused-gc-slots %u", + dbg_prefix_mode, amount, settled, (int)left, + txn->tw.lifo_reclaimed + ? (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) + : 0, + reused_gc_slot); + if (0 >= (int)left) + break; + + const unsigned prefer_max_scatter = 257; + txnid_t reservation_gc_id; + if (lifo) { + if (txn->tw.lifo_reclaimed == nullptr) { + txn->tw.lifo_reclaimed = mdbx_txl_alloc(); + if (unlikely(!txn->tw.lifo_reclaimed)) { + rc = MDBX_ENOMEM; + goto bailout; + } + } + if ((unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) < + prefer_max_scatter && + left > ((unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - + reused_gc_slot) * + env->me_maxgc_ov1page) { + + /* LY: need just a txn-id for saving the page list. 
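+ * Editorial note (not upstream text): the mdbx_page_alloc() calls below use + * MDBX_ALLOC_SLOT, i.e. no page is actually taken - each call merely pulls + * one more reclaimable txn-id into tw.lifo_reclaimed, to be used later as + * the key of a reserved GC record. 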
*/ + bool need_cleanup = false; + txnid_t snap_oldest; + retry_rid: + couple.outer.mc_flags &= ~C_RECLAIMING; + do { + snap_oldest = mdbx_find_oldest(txn); + rc = + mdbx_page_alloc(&couple.outer, 0, MDBX_ALLOC_GC | MDBX_ALLOC_SLOT) + .err; + if (likely(rc == MDBX_SUCCESS)) { + mdbx_trace("%s: took @%" PRIaTXN " from GC", dbg_prefix_mode, + MDBX_PNL_LAST(txn->tw.lifo_reclaimed)); + need_cleanup = true; + } + } while (rc == MDBX_SUCCESS && + (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) < + prefer_max_scatter && + left > ((unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - + reused_gc_slot) * + env->me_maxgc_ov1page); + couple.outer.mc_flags |= C_RECLAIMING; + + if (likely(rc == MDBX_SUCCESS)) { + mdbx_trace("%s: got enough from GC.", dbg_prefix_mode); + continue; + } else if (unlikely(rc != MDBX_NOTFOUND)) + /* LY: some troubles... */ + goto bailout; + + if (MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)) { + if (need_cleanup) { + mdbx_txl_sort(txn->tw.lifo_reclaimed); + cleaned_gc_slot = 0; + } + gc_rid = MDBX_PNL_LAST(txn->tw.lifo_reclaimed); + } else { + mdbx_tassert(txn, txn->tw.last_reclaimed == 0); + if (unlikely(mdbx_find_oldest(txn) != snap_oldest)) + /* should retry mdbx_page_alloc(MDBX_ALLOC_GC) + * if the oldest reader changed since the last attempt */ + goto retry_rid; + /* no reclaimable GC entries, + * therefore no entries with ID < mdbx_find_oldest(txn) */ + txn->tw.last_reclaimed = gc_rid = snap_oldest - 1; + mdbx_trace("%s: none recycled yet, set rid to @%" PRIaTXN, + dbg_prefix_mode, gc_rid); + } + + /* LY: GC is empty, will look for any free txn-id in high-to-low order. */ + while (MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) < prefer_max_scatter && + left > ((unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - + reused_gc_slot) * + env->me_maxgc_ov1page) { + if (unlikely(gc_rid < 2)) { + if (unlikely(MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) <= + reused_gc_slot)) { + mdbx_notice("** restart: reserve depleted (reused_gc_slot %u >= " + "lifo_reclaimed %u" PRIaTXN, + reused_gc_slot, + (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); + goto retry; + } + break; + } + + mdbx_tassert(txn, gc_rid >= MIN_TXNID && gc_rid <= MAX_TXNID); + --gc_rid; + key.iov_base = &gc_rid; + key.iov_len = sizeof(gc_rid); + rc = mdbx_cursor_get(&couple.outer, &key, &data, MDBX_SET_KEY); + if (unlikely(rc == MDBX_SUCCESS)) { + mdbx_debug("%s: GC's id %" PRIaTXN + " is used, continue bottom-up search", + dbg_prefix_mode, gc_rid); + ++gc_rid; + rc = mdbx_cursor_get(&couple.outer, &key, &data, MDBX_FIRST); + if (rc == MDBX_NOTFOUND) { + mdbx_debug("%s: GC is empty", dbg_prefix_mode); + break; + } + if (unlikely(rc != MDBX_SUCCESS || + key.iov_len != sizeof(mdbx_tid_t))) { + rc = MDBX_CORRUPTED; + goto bailout; + } + txnid_t gc_first = unaligned_peek_u64(4, key.iov_base); + if (!MDBX_DISABLE_PAGECHECKS && + unlikely(gc_first < MIN_TXNID || gc_first > MAX_TXNID)) { + rc = MDBX_CORRUPTED; + goto bailout; + } + if (gc_first < 2) { + mdbx_debug("%s: no free GC's id(s) less than %" PRIaTXN, + dbg_prefix_mode, gc_rid); + break; + } + gc_rid = gc_first - 1; + } + + rc = mdbx_txl_append(&txn->tw.lifo_reclaimed, gc_rid); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + + if (reused_gc_slot) + /* rare case, but it is better to clear and re-create GC entries + * with less fragmentation. */ + need_cleanup = true; + else + cleaned_gc_slot += + 1 /* mark cleanup is not needed for the added slot. 
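+ * (Editorial note, not upstream text: ids invented here by the high-to-low + * scan have no existing GC record to delete, so counting them as already + * cleaned keeps the cleanup pass above from re-visiting them.) 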
*/; + + mdbx_trace("%s: append @%" PRIaTXN + " to lifo-reclaimed, cleaned-gc-slot = %u", + dbg_prefix_mode, gc_rid, cleaned_gc_slot); + } + + if (need_cleanup) { + cleaned_gc_slot = 0; + mdbx_trace("%s: restart inner-loop to clear and re-create GC entries", + dbg_prefix_mode); + continue; + } + } + + const unsigned i = + (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - reused_gc_slot; + mdbx_tassert(txn, i > 0 && i <= MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); + reservation_gc_id = txn->tw.lifo_reclaimed[i]; + mdbx_trace("%s: take @%" PRIaTXN " from lifo-reclaimed[%u]", + dbg_prefix_mode, reservation_gc_id, i); + } else { + mdbx_tassert(txn, txn->tw.lifo_reclaimed == NULL); + if (unlikely(gc_rid == 0)) { + gc_rid = mdbx_find_oldest(txn) - 1; + rc = mdbx_cursor_get(&couple.outer, &key, NULL, MDBX_FIRST); + if (rc == MDBX_SUCCESS) { + if (!MDBX_DISABLE_PAGECHECKS && + unlikely(key.iov_len != sizeof(txnid_t))) { + rc = MDBX_CORRUPTED; + goto bailout; + } + txnid_t gc_first = unaligned_peek_u64(4, key.iov_base); + if (!MDBX_DISABLE_PAGECHECKS && + unlikely(gc_first < MIN_TXNID || gc_first > MAX_TXNID)) { + rc = MDBX_CORRUPTED; + goto bailout; + } + if (gc_rid >= gc_first) + gc_rid = gc_first - 1; + if (unlikely(gc_rid == 0)) { + mdbx_error("%s", "** no GC tail-space to store"); + goto retry; + } + } else if (rc != MDBX_NOTFOUND) + goto bailout; + txn->tw.last_reclaimed = gc_rid; + } + reservation_gc_id = gc_rid--; + mdbx_trace("%s: take @%" PRIaTXN " from head-gc-id", dbg_prefix_mode, + reservation_gc_id); + } + ++reused_gc_slot; + + unsigned chunk = left; + if (unlikely(chunk > env->me_maxgc_ov1page)) { + const unsigned avail_gc_slots = + txn->tw.lifo_reclaimed + ? (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - + reused_gc_slot + 1 + : (gc_rid < INT16_MAX) ? (unsigned)gc_rid : INT16_MAX; + if (avail_gc_slots > 1) { + if (chunk < env->me_maxgc_ov1page * 2) + chunk /= 2; + else { + const unsigned threshold = + env->me_maxgc_ov1page * ((avail_gc_slots < prefer_max_scatter) + ? avail_gc_slots + : prefer_max_scatter); + if (left < threshold) + chunk = env->me_maxgc_ov1page; + else { + const unsigned tail = left - threshold + env->me_maxgc_ov1page + 1; + unsigned span = 1; + unsigned avail = (unsigned)((pgno2bytes(env, span) - PAGEHDRSZ) / + sizeof(pgno_t)) /*- 1 + span */; + if (tail > avail) { + for (unsigned i = amount - span; i > 0; --i) { + if (MDBX_PNL_ASCENDING + ? (txn->tw.reclaimed_pglist[i] + span) + : (txn->tw.reclaimed_pglist[i] - span) == + txn->tw.reclaimed_pglist[i + span]) { + span += 1; + avail = (unsigned)((pgno2bytes(env, span) - PAGEHDRSZ) / + sizeof(pgno_t)) - + 1 + span; + if (avail >= tail) + break; + } + } + } + + chunk = (avail >= tail) ? tail - span + : (avail_gc_slots > 3 && + reused_gc_slot < prefer_max_scatter - 3) + ? 
avail - span + : tail; + } + } + } + } + mdbx_tassert(txn, chunk > 0); + + mdbx_trace("%s: gc_rid %" PRIaTXN ", reused_gc_slot %u, reservation-id " + "%" PRIaTXN, + dbg_prefix_mode, gc_rid, reused_gc_slot, reservation_gc_id); + + mdbx_trace("%s: chunk %u, gc-per-ovpage %u", dbg_prefix_mode, chunk, + env->me_maxgc_ov1page); + + mdbx_tassert(txn, reservation_gc_id < env->me_lck->mti_oldest_reader.weak); + if (unlikely( + reservation_gc_id < 1 || + reservation_gc_id >= + atomic_load64(&env->me_lck->mti_oldest_reader, mo_Relaxed))) { + mdbx_error("** internal error (reservation_gc_id %" PRIaTXN ")", + reservation_gc_id); + rc = MDBX_PROBLEM; + goto bailout; + } + + key.iov_len = sizeof(reservation_gc_id); + key.iov_base = &reservation_gc_id; + data.iov_len = (chunk + 1) * sizeof(pgno_t); + mdbx_trace("%s.reserve: %u [%u...%u] @%" PRIaTXN, dbg_prefix_mode, chunk, + settled + 1, settled + chunk + 1, reservation_gc_id); + mdbx_prep_backlog(txn, &couple.outer, data.iov_len); + rc = mdbx_cursor_put(&couple.outer, &key, &data, + MDBX_RESERVE | MDBX_NOOVERWRITE); + mdbx_tassert(txn, + mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + + clean_reserved_gc_pnl(env, data); + settled += chunk; + mdbx_trace("%s.settled %u (+%u), continue", dbg_prefix_mode, settled, + chunk); + + if (txn->tw.lifo_reclaimed && + unlikely(amount < MDBX_PNL_SIZE(txn->tw.reclaimed_pglist))) { + mdbx_notice("** restart: reclaimed-list growth %u -> %u", amount, + (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)); + goto retry_noaccount; + } + + continue; + } + + mdbx_tassert( + txn, + cleaned_gc_slot == + (txn->tw.lifo_reclaimed ? MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) : 0)); + + mdbx_trace("%s", " >> filling"); + /* Fill in the reserved records */ + filled_gc_slot = + txn->tw.lifo_reclaimed + ? 
(unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - reused_gc_slot + : reused_gc_slot; + rc = MDBX_SUCCESS; + mdbx_tassert(txn, + mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + mdbx_tassert(txn, mdbx_dirtylist_check(txn)); + if (MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)) { + MDBX_val key, data; + key.iov_len = data.iov_len = 0; /* avoid MSVC warning */ + key.iov_base = data.iov_base = NULL; + + const unsigned amount = MDBX_PNL_SIZE(txn->tw.reclaimed_pglist); + unsigned left = amount; + if (txn->tw.lifo_reclaimed == nullptr) { + mdbx_tassert(txn, lifo == 0); + rc = mdbx_cursor_first(&couple.outer, &key, &data); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } else { + mdbx_tassert(txn, lifo != 0); + } + + while (true) { + txnid_t fill_gc_id; + mdbx_trace("%s: left %u of %u", dbg_prefix_mode, left, + (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)); + if (txn->tw.lifo_reclaimed == nullptr) { + mdbx_tassert(txn, lifo == 0); + fill_gc_id = unaligned_peek_u64(4, key.iov_base); + if (filled_gc_slot-- == 0 || fill_gc_id > txn->tw.last_reclaimed) { + mdbx_notice( + "** restart: reserve depleted (filled_slot %u, fill_id %" PRIaTXN + " > last_reclaimed %" PRIaTXN, + filled_gc_slot, fill_gc_id, txn->tw.last_reclaimed); + goto retry; + } + } else { + mdbx_tassert(txn, lifo != 0); + if (++filled_gc_slot > + (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)) { + mdbx_notice("** restart: reserve depleted (filled_gc_slot %u > " + "lifo_reclaimed %u" PRIaTXN, + filled_gc_slot, + (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); + goto retry; + } + fill_gc_id = txn->tw.lifo_reclaimed[filled_gc_slot]; + mdbx_trace("%s.seek-reservation @%" PRIaTXN " at lifo_reclaimed[%u]", + dbg_prefix_mode, fill_gc_id, filled_gc_slot); + key.iov_base = &fill_gc_id; + key.iov_len = sizeof(fill_gc_id); + rc = mdbx_cursor_get(&couple.outer, &key, &data, MDBX_SET_KEY); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } + mdbx_tassert(txn, cleaned_gc_slot == + (txn->tw.lifo_reclaimed + ? MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) + : 0)); + mdbx_tassert(txn, fill_gc_id > 0 && + fill_gc_id < env->me_lck->mti_oldest_reader.weak); + key.iov_base = &fill_gc_id; + key.iov_len = sizeof(fill_gc_id); + + mdbx_tassert(txn, data.iov_len >= sizeof(pgno_t) * 2); + couple.outer.mc_flags |= C_GCFREEZE; + unsigned chunk = (unsigned)(data.iov_len / sizeof(pgno_t)) - 1; + if (unlikely(chunk > left)) { + mdbx_trace("%s: chunk %u > left %u, @%" PRIaTXN, dbg_prefix_mode, chunk, + left, fill_gc_id); + if ((loop < 5 && chunk - left > loop / 2) || + chunk - left > env->me_maxgc_ov1page) { + data.iov_len = (left + 1) * sizeof(pgno_t); + if (loop < 7) + couple.outer.mc_flags &= ~C_GCFREEZE; + } + chunk = left; + } + rc = mdbx_cursor_put(&couple.outer, &key, &data, + MDBX_CURRENT | MDBX_RESERVE); + couple.outer.mc_flags &= ~C_GCFREEZE; + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + clean_reserved_gc_pnl(env, data); + + if (unlikely(txn->tw.loose_count || + amount != MDBX_PNL_SIZE(txn->tw.reclaimed_pglist))) { + mdbx_notice("** restart: reclaimed-list growth (%u -> %u, loose +%u)", + amount, MDBX_PNL_SIZE(txn->tw.reclaimed_pglist), + txn->tw.loose_count); + goto retry; + } + if (unlikely(txn->tw.lifo_reclaimed + ? 
cleaned_gc_slot < MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) + : cleaned_gc_id < txn->tw.last_reclaimed)) { + mdbx_notice("%s", "** restart: reclaimed-slots changed"); + goto retry; + } + if (unlikely(retired_stored != MDBX_PNL_SIZE(txn->tw.retired_pages))) { + mdbx_tassert(txn, + retired_stored < MDBX_PNL_SIZE(txn->tw.retired_pages)); + mdbx_notice("** restart: retired-list growth (%u -> %u)", + retired_stored, MDBX_PNL_SIZE(txn->tw.retired_pages)); + goto retry; + } + + pgno_t *dst = data.iov_base; + *dst++ = chunk; + pgno_t *src = MDBX_PNL_BEGIN(txn->tw.reclaimed_pglist) + left - chunk; + memcpy(dst, src, chunk * sizeof(pgno_t)); + pgno_t *from = src, *to = src + chunk; + mdbx_trace("%s.fill: %u [ %u:%" PRIaPGNO "...%u:%" PRIaPGNO + "] @%" PRIaTXN, + dbg_prefix_mode, chunk, + (unsigned)(from - txn->tw.reclaimed_pglist), from[0], + (unsigned)(to - txn->tw.reclaimed_pglist), to[-1], fill_gc_id); + + left -= chunk; + if (mdbx_audit_enabled()) { + rc = mdbx_audit_ex(txn, retired_stored + amount - left, true); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } + if (left == 0) { + rc = MDBX_SUCCESS; + break; + } + + if (txn->tw.lifo_reclaimed == nullptr) { + mdbx_tassert(txn, lifo == 0); + rc = mdbx_cursor_next(&couple.outer, &key, &data, MDBX_NEXT); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } else { + mdbx_tassert(txn, lifo != 0); + } + } + } + + mdbx_tassert(txn, rc == MDBX_SUCCESS); + if (unlikely(txn->tw.loose_count != 0 || + filled_gc_slot != + (txn->tw.lifo_reclaimed + ? (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) + : 0))) { + mdbx_notice("** restart: reserve excess (filled-slot %u, loose-count %u)", + filled_gc_slot, txn->tw.loose_count); + goto retry; + } + + mdbx_tassert(txn, + txn->tw.lifo_reclaimed == NULL || + cleaned_gc_slot == MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); + +bailout: + txn->tw.cursors[FREE_DBI] = couple.outer.mc_next; + +bailout_notracking: + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) = 0; + mdbx_trace("<<< %u loops, rc = %d", loop, rc); + return rc; +} + +static int mdbx_txn_write(MDBX_txn *txn, struct mdbx_iov_ctx *ctx) { + MDBX_dpl *const dl = + (txn->mt_flags & MDBX_WRITEMAP) ? txn->tw.dirtylist : mdbx_dpl_sort(txn); + int rc = MDBX_SUCCESS; + unsigned r, w; + for (w = 0, r = 1; r <= dl->length; ++r) { + MDBX_page *dp = dl->items[r].ptr; + if (dp->mp_flags & P_LOOSE) { + dl->items[++w] = dl->items[r]; + continue; + } + unsigned npages = dpl_npages(dl, r); + rc = iov_page(txn, ctx, dp, npages); + if (unlikely(rc != MDBX_SUCCESS)) + break; + } + + if (ctx->iov_items) + rc = mdbx_iov_write(txn, ctx); + + while (r <= dl->length) + dl->items[++w] = dl->items[r++]; + + dl->sorted = dpl_setlen(dl, w); + txn->tw.dirtyroom += r - 1 - w; + mdbx_tassert(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == + (txn->mt_parent ? 
txn->mt_parent->tw.dirtyroom + : txn->mt_env->me_options.dp_limit)); + return rc; +} + +/* Check txn and dbi arguments to a function */ +static __always_inline bool check_dbi(MDBX_txn *txn, MDBX_dbi dbi, + unsigned validity) { + if (likely(dbi < txn->mt_numdbs)) + return likely((txn->mt_dbistate[dbi] & validity) && + !TXN_DBI_CHANGED(txn, dbi) && + (txn->mt_dbxs[dbi].md_name.iov_base || dbi < CORE_DBS)); + + return dbi_import(txn, dbi); +} + +#ifndef LIBMDBX_NO_EXPORTS_LEGACY_API +int mdbx_txn_commit(MDBX_txn *txn) { return __inline_mdbx_txn_commit(txn); } +#endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */ + +/* Merge child txn into parent */ +static __inline void mdbx_txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, + const unsigned parent_retired_len) { + MDBX_dpl *const src = mdbx_dpl_sort(txn); + + /* Remove refunded pages from parent's dirty list */ + MDBX_dpl *const dst = mdbx_dpl_sort(parent); + if (MDBX_ENABLE_REFUND) { + unsigned n = dst->length; + while (n && dst->items[n].pgno >= parent->mt_next_pgno) { + if (!(txn->mt_env->me_flags & MDBX_WRITEMAP)) { + MDBX_page *dp = dst->items[n].ptr; + mdbx_dpage_free(txn->mt_env, dp, dpl_npages(dst, n)); + } + --n; + } + parent->tw.dirtyroom += dst->sorted - n; + dst->sorted = dpl_setlen(dst, n); + mdbx_tassert(parent, + parent->tw.dirtyroom + parent->tw.dirtylist->length == + (parent->mt_parent ? parent->mt_parent->tw.dirtyroom + : parent->mt_env->me_options.dp_limit)); + } + + /* Remove reclaimed pages from parent's dirty list */ + const MDBX_PNL reclaimed_list = parent->tw.reclaimed_pglist; + mdbx_dpl_sift(parent, reclaimed_list, false); + + /* Move retired pages from parent's dirty & spilled list to reclaimed */ + unsigned r, w, d, s, l; + for (r = w = parent_retired_len; + ++r <= MDBX_PNL_SIZE(parent->tw.retired_pages);) { + const pgno_t pgno = parent->tw.retired_pages[r]; + const unsigned di = mdbx_dpl_exist(parent, pgno); + const unsigned si = (!di && unlikely(parent->tw.spill_pages)) + ? mdbx_pnl_exist(parent->tw.spill_pages, pgno << 1) + : 0; + unsigned npages; + const char *kind; + if (di) { + MDBX_page *dp = dst->items[di].ptr; + mdbx_tassert(parent, (dp->mp_flags & ~(P_LEAF | P_LEAF2 | P_BRANCH | + P_OVERFLOW | P_SPILLED)) == 0); + npages = dpl_npages(dst, di); + mdbx_page_wash(parent, di, dp, npages); + kind = "dirty"; + l = 1; + if (unlikely(npages > l)) { + /* An OVERFLOW page may have been reused piecewise. In that case + * the retired list may contain only the head of the sequence, + * while the rest is scattered across the dirty, spilled and + * reclaimed lists. Therefore we move it to reclaimed, checking + * for a break in the sequence. In any case every fragment is + * accounted for and filtered out, i.e. if the page was split + * into parts, the point is to remove the dirty entry, while all + * the fragments are accounted for separately. 
*/ + + /* The list of retired pages is not sorted, but to speed up sorting + * it is appended in accordance with MDBX_PNL_ASCENDING */ +#if MDBX_PNL_ASCENDING + const unsigned len = MDBX_PNL_SIZE(parent->tw.retired_pages); + while (r < len && parent->tw.retired_pages[r + 1] == pgno + l) { + ++r; + if (++l == npages) + break; + } +#else + while (w > parent_retired_len && + parent->tw.retired_pages[w - 1] == pgno + l) { + --w; + if (++l == npages) + break; + } +#endif + } + } else if (unlikely(si)) { + l = npages = 1; + mdbx_spill_remove(parent, si, 1); + kind = "spilled"; + } else { + parent->tw.retired_pages[++w] = pgno; + continue; + } + + mdbx_debug("reclaim retired parent's %u->%u %s page %" PRIaPGNO, npages, l, + kind, pgno); + int err = mdbx_pnl_insert_range(&parent->tw.reclaimed_pglist, pgno, l); + mdbx_ensure(txn->mt_env, err == MDBX_SUCCESS); + } + MDBX_PNL_SIZE(parent->tw.retired_pages) = w; + + /* Filter-out parent spill list */ + if (parent->tw.spill_pages && MDBX_PNL_SIZE(parent->tw.spill_pages) > 0) { + const MDBX_PNL sl = mdbx_spill_purge(parent); + unsigned len = MDBX_PNL_SIZE(sl); + if (len) { + /* Remove refunded pages from parent's spill list */ + if (MDBX_ENABLE_REFUND && + MDBX_PNL_MOST(sl) >= (parent->mt_next_pgno << 1)) { +#if MDBX_PNL_ASCENDING + unsigned i = MDBX_PNL_SIZE(sl); + assert(MDBX_PNL_MOST(sl) == MDBX_PNL_LAST(sl)); + do { + if ((sl[i] & 1) == 0) + mdbx_debug("refund parent's spilled page %" PRIaPGNO, sl[i] >> 1); + i -= 1; + } while (i && sl[i] >= (parent->mt_next_pgno << 1)); + MDBX_PNL_SIZE(sl) = i; +#else + assert(MDBX_PNL_MOST(sl) == MDBX_PNL_FIRST(sl)); + unsigned i = 0; + do { + ++i; + if ((sl[i] & 1) == 0) + mdbx_debug("refund parent's spilled page %" PRIaPGNO, sl[i] >> 1); + } while (i < len && sl[i + 1] >= (parent->mt_next_pgno << 1)); + MDBX_PNL_SIZE(sl) = len -= i; + memmove(sl + 1, sl + 1 + i, len * sizeof(sl[0])); +#endif + } + mdbx_tassert(txn, mdbx_pnl_check4assert(sl, parent->mt_next_pgno << 1)); + + /* Remove reclaimed pages from parent's spill list */ + s = MDBX_PNL_SIZE(sl), r = MDBX_PNL_SIZE(reclaimed_list); + /* Scanning from end to begin */ + while (s && r) { + if (sl[s] & 1) { + --s; + continue; + } + const pgno_t spilled_pgno = sl[s] >> 1; + const pgno_t reclaimed_pgno = reclaimed_list[r]; + if (reclaimed_pgno != spilled_pgno) { + const bool cmp = MDBX_PNL_ORDERED(spilled_pgno, reclaimed_pgno); + s -= !cmp; + r -= cmp; + } else { + mdbx_debug("remove reclaimed parent's spilled page %" PRIaPGNO, + reclaimed_pgno); + mdbx_spill_remove(parent, s, 1); + --s; + --r; + } + } + + /* Remove anything in our dirty list from parent's spill list */ + /* Scanning spill list in descending order */ + const int step = MDBX_PNL_ASCENDING ? -1 : 1; + s = MDBX_PNL_ASCENDING ? MDBX_PNL_SIZE(sl) : 1; + d = src->length; + while (d && (MDBX_PNL_ASCENDING ? 
s > 0 : s <= MDBX_PNL_SIZE(sl))) { + if (sl[s] & 1) { + s += step; + continue; + } + const pgno_t spilled_pgno = sl[s] >> 1; + const pgno_t dirty_pgno_form = src->items[d].pgno; + const unsigned npages = dpl_npages(src, d); + const pgno_t dirty_pgno_to = dirty_pgno_form + npages; + if (dirty_pgno_form > spilled_pgno) { + --d; + continue; + } + if (dirty_pgno_to <= spilled_pgno) { + s += step; + continue; + } + + mdbx_debug("remove dirtied parent's spilled %u page %" PRIaPGNO, npages, + dirty_pgno_form); + mdbx_spill_remove(parent, s, 1); + s += step; + } + + /* Squash deleted pagenums if we deleted any */ + mdbx_spill_purge(parent); + } + } + + /* Remove anything in our spill list from parent's dirty list */ + if (txn->tw.spill_pages) { + mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.spill_pages, + parent->mt_next_pgno << 1)); + mdbx_dpl_sift(parent, txn->tw.spill_pages, true); + mdbx_tassert(parent, + parent->tw.dirtyroom + parent->tw.dirtylist->length == + (parent->mt_parent ? parent->mt_parent->tw.dirtyroom + : parent->mt_env->me_options.dp_limit)); + } + + /* Find length of merging our dirty list with parent's and release + * filter-out pages */ + for (l = 0, d = dst->length, s = src->length; d > 0 && s > 0;) { + MDBX_page *sp = src->items[s].ptr; + mdbx_tassert(parent, + (sp->mp_flags & ~(P_LEAF | P_LEAF2 | P_BRANCH | P_OVERFLOW | + P_LOOSE | P_SPILLED)) == 0); + const unsigned s_npages = dpl_npages(src, s); + const pgno_t s_pgno = src->items[s].pgno; + + MDBX_page *dp = dst->items[d].ptr; + mdbx_tassert(parent, (dp->mp_flags & ~(P_LEAF | P_LEAF2 | P_BRANCH | + P_OVERFLOW | P_SPILLED)) == 0); + const unsigned d_npages = dpl_npages(dst, d); + const pgno_t d_pgno = dst->items[d].pgno; + + if (d_pgno >= s_pgno + s_npages) { + --d; + ++l; + } else if (d_pgno + d_npages <= s_pgno) { + if (sp->mp_flags != P_LOOSE) { + sp->mp_txnid = parent->mt_front; + sp->mp_flags &= ~P_SPILLED; + } + --s; + ++l; + } else { + dst->items[d--].ptr = nullptr; + if ((txn->mt_flags & MDBX_WRITEMAP) == 0) + mdbx_dpage_free(txn->mt_env, dp, d_npages); + } + } + assert(dst->sorted == dst->length); + mdbx_tassert(parent, dst->detent >= l + d + s); + dst->sorted = l + d + s; /* the merged length */ + + while (s > 0) { + MDBX_page *sp = src->items[s].ptr; + mdbx_tassert(parent, + (sp->mp_flags & ~(P_LEAF | P_LEAF2 | P_BRANCH | P_OVERFLOW | + P_LOOSE | P_SPILLED)) == 0); + if (sp->mp_flags != P_LOOSE) { + sp->mp_txnid = parent->mt_front; + sp->mp_flags &= ~P_SPILLED; + } + --s; + } + + /* Merge our dirty list into parent's, i.e. merge(dst, src) -> dst */ + if (dst->sorted >= dst->length) { + /* from end to begin with dst extending */ + for (l = dst->sorted, s = src->length, d = dst->length; s > 0 && d > 0;) { + if (unlikely(l <= d)) { + /* squash to get a gap of free space for merge */ + for (r = w = 1; r <= d; ++r) + if (dst->items[r].ptr) { + if (w != r) { + dst->items[w] = dst->items[r]; + dst->items[r].ptr = nullptr; + } + ++w; + } + mdbx_notice("squash to begin for extending-merge %u -> %u", d, w - 1); + d = w - 1; + continue; + } + assert(l > d); + if (dst->items[d].ptr) { + dst->items[l--] = (dst->items[d].pgno > src->items[s].pgno) + ? 
dst->items[d--] + : src->items[s--]; + } else + --d; + } + if (s > 0) { + assert(l == s); + while (d > 0) { + assert(dst->items[d].ptr == nullptr); + --d; + } + do { + assert(l > 0); + dst->items[l--] = src->items[s--]; + } while (s > 0); + } else { + assert(l == d); + while (l > 0) { + assert(dst->items[l].ptr != nullptr); + --l; + } + } + } else { + /* from begin to end with dst shrinking (a lot of new overflow pages) */ + for (l = s = d = 1; s <= src->length && d <= dst->length;) { + if (unlikely(l >= d)) { + /* squash to get a gap of free space for merge */ + for (r = w = dst->length; r >= d; --r) + if (dst->items[r].ptr) { + if (w != r) { + dst->items[w] = dst->items[r]; + dst->items[r].ptr = nullptr; + } + --w; + } + mdbx_notice("squash to end for shrinking-merge %u -> %u", d, w + 1); + d = w + 1; + continue; + } + assert(l < d); + if (dst->items[d].ptr) { + dst->items[l++] = (dst->items[d].pgno < src->items[s].pgno) + ? dst->items[d++] + : src->items[s++]; + } else + ++d; + } + if (s <= src->length) { + assert(dst->sorted - l == src->length - s); + while (d <= dst->length) { + assert(dst->items[d].ptr == nullptr); + --d; + } + do { + assert(l <= dst->sorted); + dst->items[l++] = src->items[s++]; + } while (s <= src->length); + } else { + assert(dst->sorted - l == dst->length - d); + while (l <= dst->sorted) { + assert(l <= d && d <= dst->length && dst->items[d].ptr); + dst->items[l++] = dst->items[d++]; + } + } + } + parent->tw.dirtyroom -= dst->sorted - dst->length; + assert(parent->tw.dirtyroom <= parent->mt_env->me_options.dp_limit); + dpl_setlen(dst, dst->sorted); + parent->tw.dirtylru = txn->tw.dirtylru; + mdbx_tassert(parent, mdbx_dirtylist_check(parent)); + mdbx_dpl_free(txn); + + if (txn->tw.spill_pages) { + if (parent->tw.spill_pages) { + /* Must not fail since space was preserved above. */ + mdbx_pnl_xmerge(parent->tw.spill_pages, txn->tw.spill_pages); + mdbx_pnl_free(txn->tw.spill_pages); + } else { + parent->tw.spill_pages = txn->tw.spill_pages; + parent->tw.spill_least_removed = txn->tw.spill_least_removed; + } + mdbx_tassert(parent, mdbx_dirtylist_check(parent)); + } + + parent->mt_flags &= ~MDBX_TXN_HAS_CHILD; + if (parent->tw.spill_pages) { + assert(mdbx_pnl_check4assert(parent->tw.spill_pages, + parent->mt_next_pgno << 1)); + if (MDBX_PNL_SIZE(parent->tw.spill_pages)) + parent->mt_flags |= MDBX_TXN_SPILLS; + } +} + +int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { + STATIC_ASSERT(MDBX_TXN_FINISHED == + MDBX_TXN_BLOCKED - MDBX_TXN_HAS_CHILD - MDBX_TXN_ERROR); + const uint64_t ts_0 = latency ? 
mdbx_osal_monotime() : 0; + uint64_t ts_1 = 0, ts_2 = 0, ts_3 = 0, ts_4 = 0; + uint32_t audit_duration = 0; + + int rc = check_txn(txn, MDBX_TXN_FINISHED); + if (unlikely(rc != MDBX_SUCCESS)) + goto provide_latency; + + if (unlikely(txn->mt_flags & MDBX_TXN_ERROR)) { + rc = MDBX_RESULT_TRUE; + goto fail; + } + + MDBX_env *env = txn->mt_env; +#if MDBX_ENV_CHECKPID + if (unlikely(env->me_pid != mdbx_getpid())) { + env->me_flags |= MDBX_FATAL_ERROR; + rc = MDBX_PANIC; + goto provide_latency; + } +#endif /* MDBX_ENV_CHECKPID */ + + /* mdbx_txn_end() mode for a commit which writes nothing */ + unsigned end_mode = + MDBX_END_PURE_COMMIT | MDBX_END_UPDATE | MDBX_END_SLOT | MDBX_END_FREE; + if (unlikely(F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY))) + goto done; + + if (txn->mt_child) { + rc = mdbx_txn_commit_ex(txn->mt_child, NULL); + mdbx_tassert(txn, txn->mt_child == NULL); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + } + + if (unlikely(txn != env->me_txn)) { + mdbx_debug("%s", "attempt to commit unknown transaction"); + rc = MDBX_EINVAL; + goto fail; + } + + if (txn->mt_parent) { + mdbx_tassert(txn, mdbx_audit_ex(txn, 0, false) == 0); + mdbx_assert(env, txn != env->me_txn0); + MDBX_txn *const parent = txn->mt_parent; + mdbx_assert(env, parent->mt_signature == MDBX_MT_SIGNATURE); + mdbx_assert(env, parent->mt_child == txn && + (parent->mt_flags & MDBX_TXN_HAS_CHILD) != 0); + mdbx_assert(env, mdbx_dirtylist_check(txn)); + + if (txn->tw.dirtylist->length == 0 && !(txn->mt_flags & MDBX_TXN_DIRTY) && + parent->mt_numdbs == txn->mt_numdbs) { + for (int i = txn->mt_numdbs; --i >= 0;) { + mdbx_tassert(txn, (txn->mt_dbistate[i] & DBI_DIRTY) == 0); + if ((txn->mt_dbistate[i] & DBI_STALE) && + !(parent->mt_dbistate[i] & DBI_STALE)) + mdbx_tassert(txn, memcmp(&parent->mt_dbs[i], &txn->mt_dbs[i], + sizeof(MDBX_db)) == 0); + } + + mdbx_tassert(txn, memcmp(&parent->mt_geo, &txn->mt_geo, + sizeof(parent->mt_geo)) == 0); + mdbx_tassert(txn, memcmp(&parent->mt_canary, &txn->mt_canary, + sizeof(parent->mt_canary)) == 0); + mdbx_tassert(txn, !txn->tw.spill_pages || + MDBX_PNL_SIZE(txn->tw.spill_pages) == 0); + mdbx_tassert(txn, txn->tw.loose_count == 0); + + /* fast completion of pure nested transaction */ + end_mode = MDBX_END_PURE_COMMIT | MDBX_END_SLOT | MDBX_END_FREE; + goto done; + } + + /* Preserve space for spill list to avoid parent's state corruption + * if allocation fails. 
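+ * All of the reservations below (mdbx_pnl_need(), mdbx_dpl_reserve()) + * are made before any of the child's state is handed over to the + * parent, so a failed allocation still leaves the parent untouched.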
*/ + const unsigned parent_retired_len = + (unsigned)(uintptr_t)parent->tw.retired_pages; + mdbx_tassert(txn, + parent_retired_len <= MDBX_PNL_SIZE(txn->tw.retired_pages)); + const unsigned retired_delta = + MDBX_PNL_SIZE(txn->tw.retired_pages) - parent_retired_len; + if (retired_delta) { + rc = mdbx_pnl_need(&txn->tw.reclaimed_pglist, retired_delta); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + } + + if (txn->tw.spill_pages) { + if (parent->tw.spill_pages) { + rc = mdbx_pnl_need(&parent->tw.spill_pages, + MDBX_PNL_SIZE(txn->tw.spill_pages)); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + } + mdbx_spill_purge(txn); + } + + if (unlikely(txn->tw.dirtylist->length + parent->tw.dirtylist->length > + parent->tw.dirtylist->detent && + !mdbx_dpl_reserve(parent, txn->tw.dirtylist->length + + parent->tw.dirtylist->length))) { + rc = MDBX_ENOMEM; + goto fail; + } + + //------------------------------------------------------------------------- + + parent->tw.lifo_reclaimed = txn->tw.lifo_reclaimed; + txn->tw.lifo_reclaimed = NULL; + + parent->tw.retired_pages = txn->tw.retired_pages; + txn->tw.retired_pages = NULL; + + mdbx_pnl_free(parent->tw.reclaimed_pglist); + parent->tw.reclaimed_pglist = txn->tw.reclaimed_pglist; + txn->tw.reclaimed_pglist = NULL; + parent->tw.last_reclaimed = txn->tw.last_reclaimed; + + parent->mt_geo = txn->mt_geo; + parent->mt_canary = txn->mt_canary; + parent->mt_flags |= txn->mt_flags & MDBX_TXN_DIRTY; + + /* Move loose pages to parent */ +#if MDBX_ENABLE_REFUND + parent->tw.loose_refund_wl = txn->tw.loose_refund_wl; +#endif /* MDBX_ENABLE_REFUND */ + parent->tw.loose_count = txn->tw.loose_count; + parent->tw.loose_pages = txn->tw.loose_pages; + + /* Merge our cursors into parent's and close them */ + mdbx_cursors_eot(txn, true); + end_mode |= MDBX_END_EOTDONE; + + /* Update parent's DBs array */ + memcpy(parent->mt_dbs, txn->mt_dbs, txn->mt_numdbs * sizeof(MDBX_db)); + parent->mt_numdbs = txn->mt_numdbs; + parent->mt_dbistate[FREE_DBI] = txn->mt_dbistate[FREE_DBI]; + parent->mt_dbistate[MAIN_DBI] = txn->mt_dbistate[MAIN_DBI]; + for (unsigned i = CORE_DBS; i < txn->mt_numdbs; i++) { + /* preserve parent's status */ + const uint8_t state = + txn->mt_dbistate[i] | + (parent->mt_dbistate[i] & (DBI_CREAT | DBI_FRESH | DBI_DIRTY)); + mdbx_debug("db %u dbi-state %s 0x%02x -> 0x%02x", i, + (parent->mt_dbistate[i] != state) ? "update" : "still", + parent->mt_dbistate[i], state); + parent->mt_dbistate[i] = state; + } + + ts_1 = latency ? mdbx_osal_monotime() : 0; + mdbx_txn_merge(parent, txn, parent_retired_len); + ts_2 = latency ? mdbx_osal_monotime() : 0; + env->me_txn = parent; + parent->mt_child = NULL; + mdbx_tassert(parent, mdbx_dirtylist_check(parent)); + +#if MDBX_ENABLE_REFUND + mdbx_refund(parent); + if (mdbx_assert_enabled()) { + /* Check parent's loose pages not suitable for refund */ + for (MDBX_page *lp = parent->tw.loose_pages; lp; lp = lp->mp_next) + mdbx_tassert(parent, lp->mp_pgno < parent->tw.loose_refund_wl && + lp->mp_pgno + 1 < parent->mt_next_pgno); + /* Check parent's reclaimed pages not suitable for refund */ + if (MDBX_PNL_SIZE(parent->tw.reclaimed_pglist)) + mdbx_tassert(parent, MDBX_PNL_MOST(parent->tw.reclaimed_pglist) + 1 < + parent->mt_next_pgno); + } +#endif /* MDBX_ENABLE_REFUND */ + + ts_4 = ts_3 = latency ? 
mdbx_osal_monotime() : 0; + txn->mt_signature = 0; + mdbx_free(txn); + mdbx_tassert(parent, mdbx_audit_ex(parent, 0, false) == 0); + rc = MDBX_SUCCESS; + goto provide_latency; + } + + mdbx_tassert(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == + (txn->mt_parent ? txn->mt_parent->tw.dirtyroom + : txn->mt_env->me_options.dp_limit)); + mdbx_cursors_eot(txn, false); + end_mode |= MDBX_END_EOTDONE; + + if (txn->tw.dirtylist->length == 0 && + (txn->mt_flags & (MDBX_TXN_DIRTY | MDBX_TXN_SPILLS)) == 0) { + for (int i = txn->mt_numdbs; --i >= 0;) + mdbx_tassert(txn, (txn->mt_dbistate[i] & DBI_DIRTY) == 0); + rc = MDBX_SUCCESS; + goto done; + } + + mdbx_debug("committing txn %" PRIaTXN " %p on mdbenv %p, root page %" PRIaPGNO + "/%" PRIaPGNO, + txn->mt_txnid, (void *)txn, (void *)env, + txn->mt_dbs[MAIN_DBI].md_root, txn->mt_dbs[FREE_DBI].md_root); + + /* Update DB root pointers */ + if (txn->mt_numdbs > CORE_DBS) { + MDBX_cursor_couple couple; + MDBX_val data; + data.iov_len = sizeof(MDBX_db); + + rc = mdbx_cursor_init(&couple.outer, txn, MAIN_DBI); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + for (MDBX_dbi i = CORE_DBS; i < txn->mt_numdbs; i++) { + if (txn->mt_dbistate[i] & DBI_DIRTY) { + if (unlikely(TXN_DBI_CHANGED(txn, i))) { + rc = MDBX_BAD_DBI; + goto fail; + } + MDBX_db *db = &txn->mt_dbs[i]; + mdbx_debug("update main's entry for sub-db %u, mod_txnid %" PRIaTXN + " -> %" PRIaTXN, + i, db->md_mod_txnid, txn->mt_txnid); + db->md_mod_txnid = txn->mt_txnid; + data.iov_base = db; + WITH_CURSOR_TRACKING(couple.outer, + rc = mdbx_cursor_put(&couple.outer, + &txn->mt_dbxs[i].md_name, + &data, F_SUBDATA)); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + } + } + } + + ts_1 = latency ? mdbx_osal_monotime() : 0; + rc = mdbx_update_gc(txn); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + + ts_2 = latency ? mdbx_osal_monotime() : 0; + if (mdbx_audit_enabled()) { + rc = mdbx_audit_ex(txn, MDBX_PNL_SIZE(txn->tw.retired_pages), true); + const uint64_t audit_end = mdbx_osal_monotime(); + audit_duration = mdbx_osal_monotime_to_16dot16(audit_end - ts_2); + ts_2 = audit_end; + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + } + + struct mdbx_iov_ctx ctx; + mdbx_iov_init(txn, &ctx); + rc = mdbx_txn_write(txn, &ctx); + if (likely(rc == MDBX_SUCCESS)) + mdbx_iov_done(txn, &ctx); + /* TODO: use ctx.flush_begin & ctx.flush_end for range-sync */ + ts_3 = latency ? mdbx_osal_monotime() : 0; + + if (likely(rc == MDBX_SUCCESS)) { + if (txn->mt_dbs[MAIN_DBI].md_flags & DBI_DIRTY) + txn->mt_dbs[MAIN_DBI].md_mod_txnid = txn->mt_txnid; + txn->mt_dbs[FREE_DBI].md_mod_txnid = txn->mt_txnid; + + MDBX_meta meta, *head = mdbx_meta_head(env); + memcpy(meta.mm_magic_and_version, head->mm_magic_and_version, 8); + meta.mm_extra_flags = head->mm_extra_flags; + meta.mm_validator_id = head->mm_validator_id; + meta.mm_extra_pagehdr = head->mm_extra_pagehdr; + unaligned_poke_u64(4, meta.mm_pages_retired, + unaligned_peek_u64(4, head->mm_pages_retired) + + MDBX_PNL_SIZE(txn->tw.retired_pages)); + + meta.mm_geo = txn->mt_geo; + meta.mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI]; + meta.mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; + meta.mm_canary = txn->mt_canary; + mdbx_meta_set_txnid(env, &meta, txn->mt_txnid); + + rc = mdbx_sync_locked( + env, env->me_flags | txn->mt_flags | MDBX_SHRINK_ALLOWED, &meta); + } + ts_4 = latency ? 
mdbx_osal_monotime() : 0; + if (unlikely(rc != MDBX_SUCCESS)) { + env->me_flags |= MDBX_FATAL_ERROR; + goto fail; + } + + end_mode = MDBX_END_COMMITTED | MDBX_END_UPDATE | MDBX_END_EOTDONE; + +done: + rc = mdbx_txn_end(txn, end_mode); + +provide_latency: + if (latency) { + latency->audit = audit_duration; + latency->preparation = + ts_1 ? mdbx_osal_monotime_to_16dot16(ts_1 - ts_0) : 0; + latency->gc = + (ts_1 && ts_2) ? mdbx_osal_monotime_to_16dot16(ts_2 - ts_1) : 0; + latency->write = + (ts_2 && ts_3) ? mdbx_osal_monotime_to_16dot16(ts_3 - ts_2) : 0; + latency->sync = + (ts_3 && ts_4) ? mdbx_osal_monotime_to_16dot16(ts_4 - ts_3) : 0; + const uint64_t ts_5 = mdbx_osal_monotime(); + latency->ending = ts_4 ? mdbx_osal_monotime_to_16dot16(ts_5 - ts_4) : 0; + latency->whole = mdbx_osal_monotime_to_16dot16(ts_5 - ts_0); + } + return rc; + +fail: + mdbx_txn_abort(txn); + goto provide_latency; +} + +static __cold int +mdbx_validate_meta(MDBX_env *env, MDBX_meta *const meta, uint64_t *filesize, + const MDBX_page *const page, const unsigned meta_number, + MDBX_meta *dest, const unsigned guess_pagesize) { + const uint64_t magic_and_version = + unaligned_peek_u64(4, &meta->mm_magic_and_version); + if (magic_and_version != MDBX_DATA_MAGIC && + magic_and_version != MDBX_DATA_MAGIC_DEVEL) { + mdbx_error("meta[%u] has invalid magic/version %" PRIx64, meta_number, + magic_and_version); + return ((magic_and_version >> 8) != MDBX_MAGIC) ? MDBX_INVALID + : MDBX_VERSION_MISMATCH; + } + + if (page->mp_pgno != meta_number) { + mdbx_error("meta[%u] has invalid pageno %" PRIaPGNO, meta_number, + page->mp_pgno); + return MDBX_INVALID; + } + + if (page->mp_flags != P_META) { + mdbx_error("page #%u not a meta-page", meta_number); + return MDBX_INVALID; + } + + /* LY: check pagesize */ + if (!is_powerof2(meta->mm_psize) || meta->mm_psize < MIN_PAGESIZE || + meta->mm_psize > MAX_PAGESIZE) { + mdbx_warning("meta[%u] has invalid pagesize (%u), skip it", meta_number, + meta->mm_psize); + return is_powerof2(meta->mm_psize) ? 
MDBX_VERSION_MISMATCH : MDBX_INVALID; + } + + if (dest && meta_number == 0 && guess_pagesize != meta->mm_psize) { + dest->mm_psize = meta->mm_psize; + mdbx_verbose("meta[%u] took pagesize %u", meta_number, meta->mm_psize); + } + + if (unaligned_peek_u64(4, &meta->mm_txnid_a) != + unaligned_peek_u64(4, &meta->mm_txnid_b)) { + mdbx_warning("meta[%u] not completely updated, skip it", meta_number); + return MDBX_RESULT_TRUE; + } + + /* LY: check signature as a checksum */ + if (META_IS_STEADY(meta) && + unaligned_peek_u64(4, &meta->mm_datasync_sign) != mdbx_meta_sign(meta)) { + mdbx_warning("meta[%u] has invalid steady-checksum (0x%" PRIx64 + " != 0x%" PRIx64 "), skip it", + meta_number, unaligned_peek_u64(4, &meta->mm_datasync_sign), + mdbx_meta_sign(meta)); + return MDBX_RESULT_TRUE; + } + + mdbx_debug("read meta%" PRIaPGNO " = root %" PRIaPGNO "/%" PRIaPGNO + ", geo %" PRIaPGNO "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO + " +%u -%u, txn_id %" PRIaTXN ", %s", + page->mp_pgno, meta->mm_dbs[MAIN_DBI].md_root, + meta->mm_dbs[FREE_DBI].md_root, meta->mm_geo.lower, + meta->mm_geo.next, meta->mm_geo.now, meta->mm_geo.upper, + pv2pages(meta->mm_geo.grow_pv), pv2pages(meta->mm_geo.shrink_pv), + unaligned_peek_u64(4, meta->mm_txnid_a), mdbx_durable_str(meta)); + + /* LY: check min-pages value */ + if (meta->mm_geo.lower < MIN_PAGENO || meta->mm_geo.lower > MAX_PAGENO) { + mdbx_warning("meta[%u] has invalid min-pages (%" PRIaPGNO "), skip it", + meta_number, meta->mm_geo.lower); + return MDBX_INVALID; + } + + /* LY: check max-pages value */ + if (meta->mm_geo.upper < MIN_PAGENO || meta->mm_geo.upper > MAX_PAGENO || + meta->mm_geo.upper < meta->mm_geo.lower) { + mdbx_warning("meta[%u] has invalid max-pages (%" PRIaPGNO "), skip it", + meta_number, meta->mm_geo.upper); + return MDBX_INVALID; + } + + /* LY: check last_pgno */ + if (meta->mm_geo.next < MIN_PAGENO || meta->mm_geo.next - 1 > MAX_PAGENO) { + mdbx_warning("meta[%u] has invalid next-pageno (%" PRIaPGNO "), skip it", + meta_number, meta->mm_geo.next); + return MDBX_CORRUPTED; + } + + /* LY: check filesize & used_bytes */ + const uint64_t used_bytes = meta->mm_geo.next * (uint64_t)meta->mm_psize; + if (used_bytes > *filesize) { + /* Here could be a race with DB-shrinking performed by other process */ + int err = mdbx_filesize(env->me_lazy_fd, filesize); + if (unlikely(err != MDBX_SUCCESS)) + return err; + if (used_bytes > *filesize) { + mdbx_warning("meta[%u] used-bytes (%" PRIu64 ") beyond filesize (%" PRIu64 + "), skip it", + meta_number, used_bytes, *filesize); + return MDBX_CORRUPTED; + } + } + if (meta->mm_geo.next - 1 > MAX_PAGENO || used_bytes > MAX_MAPSIZE) { + mdbx_warning("meta[%u] has too large used-space (%" PRIu64 "), skip it", + meta_number, used_bytes); + return MDBX_TOO_LARGE; + } + + /* LY: check mapsize limits */ + const uint64_t mapsize_min = meta->mm_geo.lower * (uint64_t)meta->mm_psize; + STATIC_ASSERT(MAX_MAPSIZE < PTRDIFF_MAX - MAX_PAGESIZE); + STATIC_ASSERT(MIN_MAPSIZE < MAX_MAPSIZE); + if (mapsize_min < MIN_MAPSIZE || mapsize_min > MAX_MAPSIZE) { + if (MAX_MAPSIZE != MAX_MAPSIZE64 && mapsize_min > MAX_MAPSIZE && + mapsize_min <= MAX_MAPSIZE64) { + mdbx_assert(env, meta->mm_geo.next - 1 <= MAX_PAGENO && + used_bytes <= MAX_MAPSIZE); + mdbx_warning("meta[%u] has too large min-mapsize (%" PRIu64 "), " + "but size of used space still acceptable (%" PRIu64 ")", + meta_number, mapsize_min, used_bytes); + meta->mm_geo.lower = (pgno_t)(MAX_MAPSIZE / meta->mm_psize); + } else { + mdbx_warning("meta[%u] has invalid min-mapsize (%" 
PRIu64 "), skip it", + meta_number, mapsize_min); + return MDBX_VERSION_MISMATCH; + } + } + + const uint64_t mapsize_max = meta->mm_geo.upper * (uint64_t)meta->mm_psize; + STATIC_ASSERT(MIN_MAPSIZE < MAX_MAPSIZE); + if (mapsize_max > MAX_MAPSIZE || + MAX_PAGENO < ceil_powerof2((size_t)mapsize_max, env->me_os_psize) / + (size_t)meta->mm_psize) { + /* allow to open large DB from a 32-bit environment */ + mdbx_assert(env, meta->mm_geo.next - 1 <= MAX_PAGENO && + used_bytes <= MAX_MAPSIZE); + mdbx_warning("meta[%u] has too large max-mapsize (%" PRIu64 "), " + "but size of used space still acceptable (%" PRIu64 ")", + meta_number, mapsize_max, used_bytes); + meta->mm_geo.upper = (pgno_t)(MAX_MAPSIZE / meta->mm_psize); + } + + /* LY: check and silently put mm_geo.now into [geo.lower...geo.upper]. + * + * Copy-with-compaction by previous version of libmdbx could produce DB-file + * less than meta.geo.lower bound, in case actual filling is low or no data + * at all. This is not a problem as there is no damage or loss of data. + * Therefore it is better not to consider such situation as an error, but + * silently correct it. */ + if (meta->mm_geo.now < meta->mm_geo.lower) + meta->mm_geo.now = meta->mm_geo.lower; + if (meta->mm_geo.now > meta->mm_geo.upper && + meta->mm_geo.next <= meta->mm_geo.upper) + meta->mm_geo.now = meta->mm_geo.upper; + + if (meta->mm_geo.next > meta->mm_geo.now) { + mdbx_warning("meta[%u] next-pageno (%" PRIaPGNO + ") is beyond end-pgno (%" PRIaPGNO "), skip it", + meta_number, meta->mm_geo.next, meta->mm_geo.now); + return MDBX_CORRUPTED; + } + + /* LY: GC root */ + if (meta->mm_dbs[FREE_DBI].md_root == P_INVALID) { + if (meta->mm_dbs[FREE_DBI].md_branch_pages || + meta->mm_dbs[FREE_DBI].md_depth || meta->mm_dbs[FREE_DBI].md_entries || + meta->mm_dbs[FREE_DBI].md_leaf_pages || + meta->mm_dbs[FREE_DBI].md_overflow_pages) { + mdbx_warning("meta[%u] has false-empty GC, skip it", meta_number); + return MDBX_CORRUPTED; + } + } else if (meta->mm_dbs[FREE_DBI].md_root >= meta->mm_geo.next) { + mdbx_warning("meta[%u] has invalid GC-root %" PRIaPGNO ", skip it", + meta_number, meta->mm_dbs[FREE_DBI].md_root); + return MDBX_CORRUPTED; + } + + /* LY: MainDB root */ + if (meta->mm_dbs[MAIN_DBI].md_root == P_INVALID) { + if (meta->mm_dbs[MAIN_DBI].md_branch_pages || + meta->mm_dbs[MAIN_DBI].md_depth || meta->mm_dbs[MAIN_DBI].md_entries || + meta->mm_dbs[MAIN_DBI].md_leaf_pages || + meta->mm_dbs[MAIN_DBI].md_overflow_pages) { + mdbx_warning("meta[%u] has false-empty maindb", meta_number); + return MDBX_CORRUPTED; + } + } else if (meta->mm_dbs[MAIN_DBI].md_root >= meta->mm_geo.next) { + mdbx_warning("meta[%u] has invalid maindb-root %" PRIaPGNO ", skip it", + meta_number, meta->mm_dbs[MAIN_DBI].md_root); + return MDBX_CORRUPTED; + } + + if (unaligned_peek_u64(4, &meta->mm_txnid_a) == 0) { + mdbx_warning("meta[%u] has zero txnid, skip it", meta_number); + return MDBX_RESULT_TRUE; + } + + return MDBX_SUCCESS; +} + +/* Read the environment parameters of a DB environment + * before mapping it into memory. */ +static __cold int mdbx_read_header(MDBX_env *env, MDBX_meta *dest, + uint64_t *filesize, const int lck_exclusive, + const mdbx_mode_t mode_bits) { + int rc = mdbx_filesize(env->me_lazy_fd, filesize); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + memset(dest, 0, sizeof(MDBX_meta)); + unaligned_poke_u64(4, dest->mm_datasync_sign, MDBX_DATASIGN_WEAK); + rc = MDBX_CORRUPTED; + + /* Read twice all meta pages so we can find the latest one. 
*/ + unsigned loop_limit = NUM_METAS * 2; + for (unsigned loop_count = 0; loop_count < loop_limit; ++loop_count) { + /* We don't know the page size on first time. + * So, just guess it. */ + unsigned guess_pagesize = dest->mm_psize; + if (guess_pagesize == 0) + guess_pagesize = + (loop_count > NUM_METAS) ? env->me_psize : env->me_os_psize; + + const unsigned meta_number = loop_count % NUM_METAS; + const unsigned offset = guess_pagesize * meta_number; + + char buffer[MIN_PAGESIZE]; + unsigned retryleft = 42; + while (1) { + mdbx_trace("reading meta[%d]: offset %u, bytes %u, retry-left %u", + meta_number, offset, MIN_PAGESIZE, retryleft); + int err = mdbx_pread(env->me_lazy_fd, buffer, MIN_PAGESIZE, offset); + if (err != MDBX_SUCCESS) { + if (err == MDBX_ENODATA && offset == 0 && loop_count == 0 && + *filesize == 0 && mode_bits /* non-zero for DB creation */ != 0) + mdbx_notice("read meta: empty file (%d, %s)", err, + mdbx_strerror(err)); + else + mdbx_error("read meta[%u,%u]: %i, %s", offset, MIN_PAGESIZE, err, + mdbx_strerror(err)); + return err; + } + + char again[MIN_PAGESIZE]; + err = mdbx_pread(env->me_lazy_fd, again, MIN_PAGESIZE, offset); + if (err != MDBX_SUCCESS) { + mdbx_error("read meta[%u,%u]: %i, %s", offset, MIN_PAGESIZE, err, + mdbx_strerror(err)); + return err; + } + + if (memcmp(buffer, again, MIN_PAGESIZE) == 0 || --retryleft == 0) + break; + + mdbx_verbose("meta[%u] was updated, re-read it", meta_number); + } + + if (!retryleft) { + mdbx_error("meta[%u] is too volatile, skip it", meta_number); + continue; + } + + MDBX_page *const page = (MDBX_page *)buffer; + MDBX_meta *const meta = page_meta(page); + rc = mdbx_validate_meta(env, meta, filesize, page, meta_number, dest, + guess_pagesize); + if (rc != MDBX_SUCCESS) + continue; + + if ((env->me_stuck_meta < 0) + ? mdbx_meta_ot(meta_bootid_match(meta) ? prefer_last + : prefer_steady, + env, dest, meta) + : (meta_number == (unsigned)env->me_stuck_meta)) { + *dest = *meta; + if (!lck_exclusive && !META_IS_STEADY(dest)) + loop_limit += 1; /* LY: should re-read to hush race with update */ + mdbx_verbose("latch meta[%u]", meta_number); + } + } + + if (dest->mm_psize == 0 || + ((env->me_stuck_meta < 0) + ? 
(!META_IS_STEADY(dest) && + !meta_weak_acceptable(env, dest, lck_exclusive)) + : false)) { + mdbx_error("%s", "no usable meta-pages, database is corrupted"); + if (rc == MDBX_SUCCESS) { + /* TODO: try to restore the database by fully checking b-tree structure + * for the each meta page, if the corresponding option was given */ + return MDBX_CORRUPTED; + } + return rc; + } + + return MDBX_SUCCESS; +} + +static MDBX_page *__cold mdbx_meta_model(const MDBX_env *env, MDBX_page *model, + unsigned num) { + + mdbx_ensure(env, is_powerof2(env->me_psize)); + mdbx_ensure(env, env->me_psize >= MIN_PAGESIZE); + mdbx_ensure(env, env->me_psize <= MAX_PAGESIZE); + mdbx_ensure(env, env->me_dbgeo.lower >= MIN_MAPSIZE); + mdbx_ensure(env, env->me_dbgeo.upper <= MAX_MAPSIZE); + mdbx_ensure(env, env->me_dbgeo.now >= env->me_dbgeo.lower); + mdbx_ensure(env, env->me_dbgeo.now <= env->me_dbgeo.upper); + + memset(model, 0, sizeof(*model)); + model->mp_pgno = num; + model->mp_flags = P_META; + MDBX_meta *const model_meta = page_meta(model); + unaligned_poke_u64(4, model_meta->mm_magic_and_version, MDBX_DATA_MAGIC); + + model_meta->mm_geo.lower = bytes2pgno(env, env->me_dbgeo.lower); + model_meta->mm_geo.upper = bytes2pgno(env, env->me_dbgeo.upper); + model_meta->mm_geo.grow_pv = pages2pv(bytes2pgno(env, env->me_dbgeo.grow)); + model_meta->mm_geo.shrink_pv = + pages2pv(bytes2pgno(env, env->me_dbgeo.shrink)); + model_meta->mm_geo.now = bytes2pgno(env, env->me_dbgeo.now); + model_meta->mm_geo.next = NUM_METAS; + + mdbx_ensure(env, model_meta->mm_geo.lower >= MIN_PAGENO); + mdbx_ensure(env, model_meta->mm_geo.upper <= MAX_PAGENO); + mdbx_ensure(env, model_meta->mm_geo.now >= model_meta->mm_geo.lower); + mdbx_ensure(env, model_meta->mm_geo.now <= model_meta->mm_geo.upper); + mdbx_ensure(env, model_meta->mm_geo.next >= MIN_PAGENO); + mdbx_ensure(env, model_meta->mm_geo.next <= model_meta->mm_geo.now); + mdbx_ensure(env, model_meta->mm_geo.grow_pv == + pages2pv(pv2pages(model_meta->mm_geo.grow_pv))); + mdbx_ensure(env, model_meta->mm_geo.shrink_pv == + pages2pv(pv2pages(model_meta->mm_geo.shrink_pv))); + + model_meta->mm_psize = env->me_psize; + model_meta->mm_flags = (uint16_t)env->me_flags; + model_meta->mm_flags |= + MDBX_INTEGERKEY; /* this is mm_dbs[FREE_DBI].md_flags */ + model_meta->mm_dbs[FREE_DBI].md_root = P_INVALID; + model_meta->mm_dbs[MAIN_DBI].md_root = P_INVALID; + mdbx_meta_set_txnid(env, model_meta, MIN_TXNID + num); + unaligned_poke_u64(4, model_meta->mm_datasync_sign, + mdbx_meta_sign(model_meta)); + return (MDBX_page *)((uint8_t *)model + env->me_psize); +} + +/* Fill in most of the zeroed meta-pages for an empty database environment. + * Return pointer to recently (head) meta-page. */ +static MDBX_meta *__cold mdbx_init_metas(const MDBX_env *env, void *buffer) { + MDBX_page *page0 = (MDBX_page *)buffer; + MDBX_page *page1 = mdbx_meta_model(env, page0, 0); + MDBX_page *page2 = mdbx_meta_model(env, page1, 1); + mdbx_meta_model(env, page2, 2); + mdbx_assert(env, !mdbx_meta_eq(env, page_meta(page0), page_meta(page1))); + mdbx_assert(env, !mdbx_meta_eq(env, page_meta(page1), page_meta(page2))); + mdbx_assert(env, !mdbx_meta_eq(env, page_meta(page2), page_meta(page0))); + return page_meta(page2); +} + +#if MDBX_ENABLE_MADVISE && !(defined(_WIN32) || defined(_WIN64)) +static size_t mdbx_madvise_threshold(const MDBX_env *env, + const size_t largest_bytes) { + /* TODO: use options */ + const unsigned factor = 9; + const size_t threshold = (largest_bytes < (65536ul << factor)) + ? 
65536 /* minimal threshold */ + : (largest_bytes > (MEGABYTE * 4 << factor)) + ? MEGABYTE * 4 /* maximal threshold */ + : largest_bytes >> factor; + return bytes_align2os_bytes(env, threshold); +} +#endif /* MDBX_ENABLE_MADVISE */ + +static int mdbx_sync_locked(MDBX_env *env, unsigned flags, + MDBX_meta *const pending) { + mdbx_assert(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0); + MDBX_meta *const meta0 = METAPAGE(env, 0); + MDBX_meta *const meta1 = METAPAGE(env, 1); + MDBX_meta *const meta2 = METAPAGE(env, 2); + MDBX_meta *const head = mdbx_meta_head(env); + int rc; + + mdbx_assert(env, mdbx_meta_eq_mask(env) == 0); + mdbx_assert(env, + pending < METAPAGE(env, 0) || pending > METAPAGE(env, NUM_METAS)); + mdbx_assert(env, (env->me_flags & (MDBX_RDONLY | MDBX_FATAL_ERROR)) == 0); + mdbx_assert(env, pending->mm_geo.next <= pending->mm_geo.now); + + if (flags & MDBX_SAFE_NOSYNC) { + /* Check auto-sync conditions */ + const pgno_t autosync_threshold = + atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed); + const uint64_t autosync_period = + atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed); + if ((autosync_threshold && + atomic_load32(&env->me_lck->mti_unsynced_pages, mo_Relaxed) >= + autosync_threshold) || + (autosync_period && + mdbx_osal_monotime() - + atomic_load64(&env->me_lck->mti_sync_timestamp, mo_Relaxed) >= + autosync_period)) + flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; /* force steady */ + } + + pgno_t shrink = 0; + if (flags & MDBX_SHRINK_ALLOWED) { + /* LY: check conditions to discard unused pages */ + const pgno_t largest_pgno = mdbx_find_largest( + env, (head->mm_geo.next > pending->mm_geo.next) ? head->mm_geo.next + : pending->mm_geo.next); + mdbx_assert(env, largest_pgno >= NUM_METAS); +#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) + const pgno_t edge = env->me_poison_edge; + if (edge > largest_pgno) { + env->me_poison_edge = largest_pgno; + VALGRIND_MAKE_MEM_NOACCESS(env->me_map + pgno2bytes(env, largest_pgno), + pgno2bytes(env, edge - largest_pgno)); + ASAN_POISON_MEMORY_REGION(env->me_map + pgno2bytes(env, largest_pgno), + pgno2bytes(env, edge - largest_pgno)); + } +#endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */ +#if MDBX_ENABLE_MADVISE && \ + (defined(MADV_DONTNEED) || defined(POSIX_MADV_DONTNEED)) + const size_t largest_bytes = pgno2bytes(env, largest_pgno); + /* threshold to avoid unreasonable frequent madvise() calls */ + const size_t madvise_threshold = mdbx_madvise_threshold(env, largest_bytes); + const size_t discard_edge_bytes = bytes_align2os_bytes( + env, ((MDBX_RDONLY & + (env->me_lck_mmap.lck ? env->me_lck_mmap.lck->mti_envmode.weak + : env->me_flags)) + ? 
largest_bytes + : largest_bytes + madvise_threshold)); + const pgno_t discard_edge_pgno = bytes2pgno(env, discard_edge_bytes); + const pgno_t prev_discarded_pgno = + atomic_load32(&env->me_lck->mti_discarded_tail, mo_Relaxed); + if (prev_discarded_pgno >= + discard_edge_pgno + bytes2pgno(env, madvise_threshold)) { + mdbx_notice("open-MADV_%s %u..%u", "DONTNEED", prev_discarded_pgno, + largest_pgno); + atomic_store32(&env->me_lck->mti_discarded_tail, discard_edge_pgno, + mo_Relaxed); + const size_t prev_discarded_bytes = + ceil_powerof2(pgno2bytes(env, prev_discarded_pgno), env->me_os_psize); + mdbx_ensure(env, prev_discarded_bytes > discard_edge_bytes); +#if defined(MADV_DONTNEED) + int advise = MADV_DONTNEED; +#if defined(MADV_FREE) && \ + 0 /* MADV_FREE works for only anonymous vma at the moment */ + if ((env->me_flags & MDBX_WRITEMAP) && + mdbx_linux_kernel_version > 0x04050000) + advise = MADV_FREE; +#endif /* MADV_FREE */ + int err = madvise(env->me_map + discard_edge_bytes, + prev_discarded_bytes - discard_edge_bytes, advise) + ? ignore_enosys(errno) + : MDBX_SUCCESS; +#else + int err = ignore_enosys(posix_madvise( + env->me_map + discard_edge_bytes, + prev_discarded_bytes - discard_edge_bytes, POSIX_MADV_DONTNEED)); +#endif + if (unlikely(MDBX_IS_ERROR(err))) + return err; + } +#endif /* MDBX_ENABLE_MADVISE && (MADV_DONTNEED || POSIX_MADV_DONTNEED) */ + + /* LY: check conditions to shrink datafile */ + const pgno_t backlog_gap = 3 + pending->mm_dbs[FREE_DBI].md_depth * 3; + pgno_t shrink_step = 0; + if (pending->mm_geo.shrink_pv && + pending->mm_geo.now - pending->mm_geo.next > + (shrink_step = pv2pages(pending->mm_geo.shrink_pv)) + backlog_gap) { + if (pending->mm_geo.now > largest_pgno && + pending->mm_geo.now - largest_pgno > shrink_step + backlog_gap) { + pgno_t grow_step = 0; + const pgno_t aligner = + pending->mm_geo.grow_pv + ? (grow_step = pv2pages(pending->mm_geo.grow_pv)) + : shrink_step; + const pgno_t with_backlog_gap = largest_pgno + backlog_gap; + const pgno_t aligned = pgno_align2os_pgno( + env, with_backlog_gap + aligner - with_backlog_gap % aligner); + const pgno_t bottom = + (aligned > pending->mm_geo.lower) ? 
aligned : pending->mm_geo.lower; + if (pending->mm_geo.now > bottom) { + if (META_IS_STEADY(mdbx_meta_steady(env))) + /* force steady, but only if steady-checkpoint is present */ + flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; + shrink = pending->mm_geo.now - bottom; + pending->mm_geo.now = bottom; + if (unlikely(mdbx_meta_txnid_stable(env, head) == + unaligned_peek_u64(4, pending->mm_txnid_a))) { + const txnid_t txnid = + safe64_txnid_next(unaligned_peek_u64(4, pending->mm_txnid_a)); + if (unlikely(txnid > MAX_TXNID)) { + rc = MDBX_TXN_FULL; + mdbx_error("txnid overflow, raise %d", rc); + goto fail; + } + mdbx_meta_set_txnid(env, pending, txnid); + } + } + } + } + } + + /* LY: step#1 - sync previously written/updated data-pages */ + rc = MDBX_RESULT_FALSE /* carry steady */; + if (atomic_load32(&env->me_lck->mti_unsynced_pages, mo_Relaxed)) { + mdbx_assert(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); + enum mdbx_syncmode_bits mode_bits = MDBX_SYNC_NONE; + if ((flags & MDBX_SAFE_NOSYNC) == 0) { + mode_bits = MDBX_SYNC_DATA; + if (pending->mm_geo.next > mdbx_meta_steady(env)->mm_geo.now) + mode_bits |= MDBX_SYNC_SIZE; + if (flags & MDBX_NOMETASYNC) + mode_bits |= MDBX_SYNC_IODQ; + } +#if MDBX_ENABLE_PGOP_STAT + safe64_inc(&env->me_lck->mti_pgop_stat.wops, 1); +#endif /* MDBX_ENABLE_PGOP_STAT */ + if (flags & MDBX_WRITEMAP) + rc = + mdbx_msync(&env->me_dxb_mmap, 0, + pgno_align2os_bytes(env, pending->mm_geo.next), mode_bits); + else + rc = mdbx_fsync(env->me_lazy_fd, mode_bits); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + rc = (flags & MDBX_SAFE_NOSYNC) ? MDBX_RESULT_TRUE /* carry non-steady */ + : MDBX_RESULT_FALSE /* carry steady */; + } + + /* Steady or Weak */ + if (rc == MDBX_RESULT_FALSE /* carry steady */) { + atomic_store64(&env->me_lck->mti_sync_timestamp, mdbx_osal_monotime(), + mo_Relaxed); + unaligned_poke_u64(4, pending->mm_datasync_sign, mdbx_meta_sign(pending)); + atomic_store32(&env->me_lck->mti_unsynced_pages, 0, mo_Relaxed); + } else { + assert(rc == MDBX_RESULT_TRUE /* carry non-steady */); + unaligned_poke_u64(4, pending->mm_datasync_sign, MDBX_DATASIGN_WEAK); + } + + MDBX_meta *target = nullptr; + if (mdbx_meta_txnid_stable(env, head) == + unaligned_peek_u64(4, pending->mm_txnid_a)) { + mdbx_assert(env, memcmp(&head->mm_dbs, &pending->mm_dbs, + sizeof(head->mm_dbs)) == 0); + mdbx_assert(env, memcmp(&head->mm_canary, &pending->mm_canary, + sizeof(head->mm_canary)) == 0); + mdbx_assert(env, memcmp(&head->mm_geo, &pending->mm_geo, + sizeof(pending->mm_geo)) == 0); + if (!META_IS_STEADY(head) && META_IS_STEADY(pending)) + target = head; + else { + mdbx_ensure(env, mdbx_meta_eq(env, head, pending)); + mdbx_debug("%s", "skip update meta"); + return MDBX_SUCCESS; + } + } else if (head == meta0) + target = mdbx_meta_ancient(prefer_steady, env, meta1, meta2); + else if (head == meta1) + target = mdbx_meta_ancient(prefer_steady, env, meta0, meta2); + else { + mdbx_assert(env, head == meta2); + target = mdbx_meta_ancient(prefer_steady, env, meta0, meta1); + } + + /* LY: step#2 - update meta-page. 
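+ * The target chosen above is either the head itself (only to raise a + * weak meta to steady) or the most ancient of the two non-head metas, + * so the current head stays intact until the new meta is durable.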
*/ + mdbx_debug( + "writing meta%" PRIaPGNO " = root %" PRIaPGNO "/%" PRIaPGNO + ", geo %" PRIaPGNO "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO + " +%u -%u, txn_id %" PRIaTXN ", %s", + data_page(target)->mp_pgno, pending->mm_dbs[MAIN_DBI].md_root, + pending->mm_dbs[FREE_DBI].md_root, pending->mm_geo.lower, + pending->mm_geo.next, pending->mm_geo.now, pending->mm_geo.upper, + pv2pages(pending->mm_geo.grow_pv), pv2pages(pending->mm_geo.shrink_pv), + unaligned_peek_u64(4, pending->mm_txnid_a), mdbx_durable_str(pending)); + + mdbx_debug("meta0: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO + "/%" PRIaPGNO, + (meta0 == head) ? "head" : (meta0 == target) ? "tail" : "stay", + mdbx_durable_str(meta0), mdbx_meta_txnid_fluid(env, meta0), + meta0->mm_dbs[MAIN_DBI].md_root, meta0->mm_dbs[FREE_DBI].md_root); + mdbx_debug("meta1: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO + "/%" PRIaPGNO, + (meta1 == head) ? "head" : (meta1 == target) ? "tail" : "stay", + mdbx_durable_str(meta1), mdbx_meta_txnid_fluid(env, meta1), + meta1->mm_dbs[MAIN_DBI].md_root, meta1->mm_dbs[FREE_DBI].md_root); + mdbx_debug("meta2: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO + "/%" PRIaPGNO, + (meta2 == head) ? "head" : (meta2 == target) ? "tail" : "stay", + mdbx_durable_str(meta2), mdbx_meta_txnid_fluid(env, meta2), + meta2->mm_dbs[MAIN_DBI].md_root, meta2->mm_dbs[FREE_DBI].md_root); + + mdbx_assert(env, !mdbx_meta_eq(env, pending, meta0)); + mdbx_assert(env, !mdbx_meta_eq(env, pending, meta1)); + mdbx_assert(env, !mdbx_meta_eq(env, pending, meta2)); + + mdbx_assert(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0); + mdbx_ensure(env, + target == head || mdbx_meta_txnid_stable(env, target) < + unaligned_peek_u64(4, pending->mm_txnid_a)); +#if MDBX_ENABLE_PGOP_STAT + safe64_inc(&env->me_lck->mti_pgop_stat.wops, 1); +#endif /* MDBX_ENABLE_PGOP_STAT */ + if (flags & MDBX_WRITEMAP) { + mdbx_jitter4testing(true); + if (likely(target != head)) { + /* LY: 'invalidate' the meta. */ + mdbx_meta_update_begin(env, target, + unaligned_peek_u64(4, pending->mm_txnid_a)); + unaligned_poke_u64(4, target->mm_datasync_sign, MDBX_DATASIGN_WEAK); +#ifndef NDEBUG + /* debug: provoke failure to catch violators, but don't touch mm_psize + * and mm_flags to allow readers to catch the actual pagesize. 
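+ * While mm_txnid_a and mm_txnid_b are left mismatched by + * mdbx_meta_update_begin(), the meta is treated as not completely + * updated and skipped, so this 0xCC filling is never trusted.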
*/ + uint8_t *provoke_begin = (uint8_t *)&target->mm_dbs[FREE_DBI].md_root; + uint8_t *provoke_end = (uint8_t *)&target->mm_datasync_sign; + memset(provoke_begin, 0xCC, provoke_end - provoke_begin); + mdbx_jitter4testing(false); +#endif + + /* LY: update info */ + target->mm_geo = pending->mm_geo; + target->mm_dbs[FREE_DBI] = pending->mm_dbs[FREE_DBI]; + target->mm_dbs[MAIN_DBI] = pending->mm_dbs[MAIN_DBI]; + target->mm_canary = pending->mm_canary; + memcpy(target->mm_pages_retired, pending->mm_pages_retired, 8); + mdbx_jitter4testing(true); + + /* LY: 'commit' the meta */ + mdbx_meta_update_end(env, target, + unaligned_peek_u64(4, pending->mm_txnid_b)); + mdbx_jitter4testing(true); + } else { + /* dangerous case (target == head), only mm_datasync_sign could + * be updated, check assertions once again */ + mdbx_ensure(env, mdbx_meta_txnid_stable(env, head) == + unaligned_peek_u64(4, pending->mm_txnid_a) && + !META_IS_STEADY(head) && META_IS_STEADY(pending)); + mdbx_ensure(env, memcmp(&head->mm_geo, &pending->mm_geo, + sizeof(head->mm_geo)) == 0); + mdbx_ensure(env, memcmp(&head->mm_dbs, &pending->mm_dbs, + sizeof(head->mm_dbs)) == 0); + mdbx_ensure(env, memcmp(&head->mm_canary, &pending->mm_canary, + sizeof(head->mm_canary)) == 0); + } + memcpy(target->mm_datasync_sign, pending->mm_datasync_sign, 8); + mdbx_flush_incoherent_cpu_writeback(); + mdbx_jitter4testing(true); + /* sync meta-pages */ + rc = + mdbx_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), + (flags & MDBX_NOMETASYNC) ? MDBX_SYNC_NONE + : MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + } else { + const MDBX_meta undo_meta = *target; + const mdbx_filehandle_t fd = (env->me_dsync_fd != INVALID_HANDLE_VALUE) + ? env->me_dsync_fd + : env->me_lazy_fd; +#if MDBX_ENABLE_PGOP_STAT + safe64_inc(&env->me_lck->mti_pgop_stat.wops, 1); +#endif /* MDBX_ENABLE_PGOP_STAT */ + rc = mdbx_pwrite(fd, pending, sizeof(MDBX_meta), + (uint8_t *)target - env->me_map); + if (unlikely(rc != MDBX_SUCCESS)) { + undo: + mdbx_debug("%s", "write failed, disk error?"); + /* On a failure, the pagecache still contains the new data. + * Try to write some old data back, to prevent it from being used. 
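+ * (undo_meta is a copy of the target meta taken before the pwrite + * above, so writing it back restores the previous contents at least + * in the pagecache.)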
*/ + mdbx_pwrite(fd, &undo_meta, sizeof(MDBX_meta), + (uint8_t *)target - env->me_map); + goto fail; + } + mdbx_flush_incoherent_mmap(target, sizeof(MDBX_meta), env->me_os_psize); + /* sync meta-pages */ + if ((flags & MDBX_NOMETASYNC) == 0 && fd == env->me_lazy_fd) { + rc = mdbx_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + if (rc != MDBX_SUCCESS) + goto undo; + } + } + if (flags & MDBX_NOMETASYNC) + env->me_lck->mti_unsynced_pages.weak += 1; + else + env->me_lck->mti_meta_sync_txnid.weak = + (uint32_t)unaligned_peek_u64(4, pending->mm_txnid_a); + + /* LY: shrink datafile if needed */ + if (unlikely(shrink)) { + mdbx_verbose("shrink to %" PRIaPGNO " pages (-%" PRIaPGNO ")", + pending->mm_geo.now, shrink); + rc = mdbx_mapresize_implicit(env, pending->mm_geo.next, pending->mm_geo.now, + pending->mm_geo.upper); + if (MDBX_IS_ERROR(rc)) + goto fail; + } + + MDBX_lockinfo *const lck = env->me_lck_mmap.lck; + if (likely(lck)) + /* toggle oldest refresh */ + atomic_store32(&lck->mti_readers_refresh_flag, false, mo_Relaxed); + + return MDBX_SUCCESS; + +fail: + env->me_flags |= MDBX_FATAL_ERROR; + return rc; +} + +static void recalculate_merge_threshold(MDBX_env *env) { + const unsigned bytes = page_space(env); + env->me_merge_threshold = (uint16_t)( + bytes - (bytes * env->me_options.merge_threshold_16dot16_percent >> 16)); + env->me_merge_threshold_gc = (uint16_t)( + bytes - ((env->me_options.merge_threshold_16dot16_percent > 19005) + ? bytes / 3 /* 33 % */ + : bytes / 4 /* 25 % */)); +} + +static void __cold mdbx_setup_pagesize(MDBX_env *env, const size_t pagesize) { + STATIC_ASSERT(PTRDIFF_MAX > MAX_MAPSIZE); + STATIC_ASSERT(MIN_PAGESIZE > sizeof(MDBX_page) + sizeof(MDBX_meta)); + mdbx_ensure(env, is_powerof2(pagesize)); + mdbx_ensure(env, pagesize >= MIN_PAGESIZE); + mdbx_ensure(env, pagesize <= MAX_PAGESIZE); + env->me_psize = (unsigned)pagesize; + + STATIC_ASSERT(MAX_GC1OVPAGE(MIN_PAGESIZE) > 4); + STATIC_ASSERT(MAX_GC1OVPAGE(MAX_PAGESIZE) < MDBX_PGL_LIMIT / 4); + const intptr_t maxgc_ov1page = (pagesize - PAGEHDRSZ) / sizeof(pgno_t) - 1; + mdbx_ensure(env, maxgc_ov1page > 42 && + maxgc_ov1page < (intptr_t)MDBX_PGL_LIMIT / 4); + env->me_maxgc_ov1page = (unsigned)maxgc_ov1page; + + STATIC_ASSERT(LEAF_NODE_MAX(MIN_PAGESIZE) > sizeof(MDBX_db) + NODESIZE + 42); + STATIC_ASSERT(LEAF_NODE_MAX(MAX_PAGESIZE) < UINT16_MAX); + STATIC_ASSERT(LEAF_NODE_MAX(MIN_PAGESIZE) >= BRANCH_NODE_MAX(MIN_PAGESIZE)); + STATIC_ASSERT(BRANCH_NODE_MAX(MAX_PAGESIZE) > NODESIZE + 42); + STATIC_ASSERT(BRANCH_NODE_MAX(MAX_PAGESIZE) < UINT16_MAX); + const intptr_t branch_nodemax = BRANCH_NODE_MAX(pagesize); + const intptr_t leaf_nodemax = LEAF_NODE_MAX(pagesize); + mdbx_ensure(env, + branch_nodemax > (intptr_t)(NODESIZE + 42) && + branch_nodemax % 2 == 0 && + leaf_nodemax > (intptr_t)(sizeof(MDBX_db) + NODESIZE + 42) && + leaf_nodemax >= branch_nodemax && + leaf_nodemax < (int)UINT16_MAX && leaf_nodemax % 2 == 0); + env->me_leaf_nodemax = (unsigned)leaf_nodemax; + env->me_psize2log = (uint8_t)log2n_powerof2(pagesize); + mdbx_assert(env, pgno2bytes(env, 1) == pagesize); + mdbx_assert(env, bytes2pgno(env, pagesize + pagesize) == 2); + recalculate_merge_threshold(env); + + const pgno_t max_pgno = bytes2pgno(env, MAX_MAPSIZE); + if (!env->me_options.flags.non_auto.dp_limit) { + /* auto-setup dp_limit by "The42" ;-) */ + intptr_t total_ram_pages, avail_ram_pages; + int err = mdbx_get_sysraminfo(nullptr, &total_ram_pages, &avail_ram_pages); + if (unlikely(err != MDBX_SUCCESS)) + mdbx_error("mdbx_get_sysraminfo(), rc 
%d", err); + else { + size_t reasonable_dpl_limit = + (size_t)(total_ram_pages + avail_ram_pages) / 42; + if (pagesize > env->me_os_psize) + reasonable_dpl_limit /= pagesize / env->me_os_psize; + else if (pagesize < env->me_os_psize) + reasonable_dpl_limit *= env->me_os_psize / pagesize; + reasonable_dpl_limit = (reasonable_dpl_limit < MDBX_PGL_LIMIT) + ? reasonable_dpl_limit + : MDBX_PGL_LIMIT; + reasonable_dpl_limit = (reasonable_dpl_limit > CURSOR_STACK * 4) + ? reasonable_dpl_limit + : CURSOR_STACK * 4; + env->me_options.dp_limit = (unsigned)reasonable_dpl_limit; + } + } + if (env->me_options.dp_limit > max_pgno - NUM_METAS) + env->me_options.dp_limit = max_pgno - NUM_METAS; + if (env->me_options.dp_initial > env->me_options.dp_limit) + env->me_options.dp_initial = env->me_options.dp_limit; +} + +static __inline MDBX_CONST_FUNCTION MDBX_lockinfo * +lckless_stub(const MDBX_env *env) { + uintptr_t stub = (uintptr_t)&env->x_lckless_stub; + /* align to avoid false-positive alarm from UndefinedBehaviorSanitizer */ + stub = (stub + MDBX_CACHELINE_SIZE - 1) & ~(MDBX_CACHELINE_SIZE - 1); + return (MDBX_lockinfo *)stub; +} + +__cold int mdbx_env_create(MDBX_env **penv) { + MDBX_env *env = mdbx_calloc(1, sizeof(MDBX_env)); + if (unlikely(!env)) + return MDBX_ENOMEM; + + env->me_maxreaders = DEFAULT_READERS; + env->me_maxdbs = env->me_numdbs = CORE_DBS; + env->me_lazy_fd = INVALID_HANDLE_VALUE; + env->me_dsync_fd = INVALID_HANDLE_VALUE; + env->me_lfd = INVALID_HANDLE_VALUE; + env->me_pid = mdbx_getpid(); + env->me_stuck_meta = -1; + + env->me_options.dp_reserve_limit = 1024; + env->me_options.rp_augment_limit = 256 * 1024; + env->me_options.dp_limit = 64 * 1024; + if (env->me_options.dp_limit > MAX_PAGENO - NUM_METAS) + env->me_options.dp_limit = MAX_PAGENO - NUM_METAS; + env->me_options.dp_initial = MDBX_PNL_INITIAL; + if (env->me_options.dp_initial > env->me_options.dp_limit) + env->me_options.dp_initial = env->me_options.dp_limit; + env->me_options.spill_max_denominator = 8; + env->me_options.spill_min_denominator = 8; + env->me_options.spill_parent4child_denominator = 0; + env->me_options.dp_loose_limit = 64; + env->me_options.merge_threshold_16dot16_percent = 65536 / 4 /* 25% */; + + int rc; + const size_t os_psize = mdbx_syspagesize(); + if (unlikely(!is_powerof2(os_psize) || os_psize < MIN_PAGESIZE)) { + mdbx_error("unsuitable system pagesize %" PRIuPTR, os_psize); + rc = MDBX_INCOMPATIBLE; + goto bailout; + } + env->me_os_psize = (unsigned)os_psize; + mdbx_setup_pagesize(env, (env->me_os_psize < MAX_PAGESIZE) ? 
env->me_os_psize + : MAX_PAGESIZE); + + rc = mdbx_fastmutex_init(&env->me_dbi_lock); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + +#if defined(_WIN32) || defined(_WIN64) + mdbx_srwlock_Init(&env->me_remap_guard); + InitializeCriticalSection(&env->me_windowsbug_lock); +#else + rc = mdbx_fastmutex_init(&env->me_remap_guard); + if (unlikely(rc != MDBX_SUCCESS)) { + mdbx_fastmutex_destroy(&env->me_dbi_lock); + goto bailout; + } + +#if MDBX_LOCKING > MDBX_LOCKING_SYSV + MDBX_lockinfo *const stub = lckless_stub(env); + rc = mdbx_ipclock_stub(&stub->mti_wlock); +#endif /* MDBX_LOCKING */ + if (unlikely(rc != MDBX_SUCCESS)) { + mdbx_fastmutex_destroy(&env->me_remap_guard); + mdbx_fastmutex_destroy(&env->me_dbi_lock); + goto bailout; + } +#endif /* Windows */ + + VALGRIND_CREATE_MEMPOOL(env, 0, 0); + env->me_signature.weak = MDBX_ME_SIGNATURE; + *penv = env; + return MDBX_SUCCESS; + +bailout: + mdbx_free(env); + *penv = nullptr; + return rc; +} + +__cold static intptr_t get_reasonable_db_maxsize(intptr_t *cached_result) { + if (*cached_result == 0) { + intptr_t pagesize, total_ram_pages; + if (unlikely(mdbx_get_sysraminfo(&pagesize, &total_ram_pages, nullptr) != + MDBX_SUCCESS)) + return MAX_MAPSIZE32 /* the 32-bit limit is good enough for fallback */; + + if (unlikely((size_t)total_ram_pages * 2 > MAX_MAPSIZE / (size_t)pagesize)) + return MAX_MAPSIZE; + assert(MAX_MAPSIZE >= (size_t)(total_ram_pages * pagesize * 2)); + + /* Suggesting should not be more than golden ratio of the size of RAM. */ + *cached_result = (intptr_t)((size_t)total_ram_pages * 207 >> 7) * pagesize; + + /* Round to the nearest human-readable granulation. */ + for (size_t unit = MEGABYTE; unit; unit <<= 5) { + const size_t floor = floor_powerof2(*cached_result, unit); + const size_t ceil = ceil_powerof2(*cached_result, unit); + const size_t threshold = (size_t)*cached_result >> 4; + const bool down = + *cached_result - floor < ceil - *cached_result || ceil > MAX_MAPSIZE; + if (threshold < (down ? *cached_result - floor : ceil - *cached_result)) + break; + *cached_result = down ? 
floor : ceil; + } + } + return *cached_result; +} + +__cold LIBMDBX_API int +mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, + intptr_t size_upper, intptr_t growth_step, + intptr_t shrink_threshold, intptr_t pagesize) { + int rc = check_env(env, false); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + const bool inside_txn = + (env->me_txn0 && env->me_txn0->mt_owner == mdbx_thread_self()); + +#if MDBX_DEBUG + if (growth_step < 0) + growth_step = 1; + if (shrink_threshold < 0) + shrink_threshold = 1; +#endif + + intptr_t reasonable_maxsize = 0; + bool need_unlock = false; + if (env->me_map) { + /* env already mapped */ + if (unlikely(env->me_flags & MDBX_RDONLY)) + return MDBX_EACCESS; + + if (!inside_txn) { + int err = mdbx_txn_lock(env, false); + if (unlikely(err != MDBX_SUCCESS)) + return err; + need_unlock = true; + } + MDBX_meta *head = mdbx_meta_head(env); + if (!inside_txn) { + env->me_txn0->mt_txnid = meta_txnid(env, head, false); + mdbx_find_oldest(env->me_txn0); + } + + /* get untouched params from DB */ + if (pagesize <= 0 || pagesize >= INT_MAX) + pagesize = env->me_psize; + if (size_lower < 0) + size_lower = pgno2bytes(env, head->mm_geo.lower); + if (size_now < 0) + size_now = pgno2bytes(env, head->mm_geo.now); + if (size_upper < 0) + size_upper = pgno2bytes(env, head->mm_geo.upper); + if (growth_step < 0) + growth_step = pgno2bytes(env, pv2pages(head->mm_geo.grow_pv)); + if (shrink_threshold < 0) + shrink_threshold = pgno2bytes(env, pv2pages(head->mm_geo.shrink_pv)); + + if (pagesize != (intptr_t)env->me_psize) { + rc = MDBX_EINVAL; + goto bailout; + } + const size_t usedbytes = + pgno2bytes(env, mdbx_find_largest(env, head->mm_geo.next)); + if ((size_t)size_upper < usedbytes) { + rc = MDBX_MAP_FULL; + goto bailout; + } + if ((size_t)size_now < usedbytes) + size_now = usedbytes; + } else { + /* env NOT yet mapped */ + if (unlikely(inside_txn)) + return MDBX_PANIC; + + /* is requested some auto-value for pagesize ? */ + if (pagesize >= INT_MAX /* maximal */) + pagesize = MAX_PAGESIZE; + else if (pagesize <= 0) { + if (pagesize < 0 /* default */) { + pagesize = env->me_os_psize; + if ((uintptr_t)pagesize > MAX_PAGESIZE) + pagesize = MAX_PAGESIZE; + mdbx_assert(env, (uintptr_t)pagesize >= MIN_PAGESIZE); + } else if (pagesize == 0 /* minimal */) + pagesize = MIN_PAGESIZE; + + /* choose pagesize */ + intptr_t max_size = (size_now > size_lower) ? size_now : size_lower; + max_size = (size_upper > max_size) ? 
size_upper : max_size; + if (max_size < 0 /* default */) + max_size = DEFAULT_MAPSIZE; + else if (max_size == 0 /* minimal */) + max_size = MIN_MAPSIZE; + else if (max_size >= (intptr_t)MAX_MAPSIZE /* maximal */) + max_size = get_reasonable_db_maxsize(&reasonable_maxsize); + + while (max_size > pagesize * (int64_t)MAX_PAGENO && + pagesize < MAX_PAGESIZE) + pagesize <<= 1; + } + } + + if (pagesize < (intptr_t)MIN_PAGESIZE || pagesize > (intptr_t)MAX_PAGESIZE || + !is_powerof2(pagesize)) { + rc = MDBX_EINVAL; + goto bailout; + } + + if (size_lower <= 0) { + size_lower = MIN_MAPSIZE; + if (MIN_MAPSIZE / pagesize < MIN_PAGENO) + size_lower = MIN_PAGENO * pagesize; + } + if (size_lower >= INTPTR_MAX) { + size_lower = get_reasonable_db_maxsize(&reasonable_maxsize); + if ((size_t)size_lower / pagesize > MAX_PAGENO) + size_lower = pagesize * MAX_PAGENO; + } + + if (size_now <= 0) { + size_now = size_lower; + if (size_upper >= size_lower && size_now > size_upper) + size_now = size_upper; + } + if (size_now >= INTPTR_MAX) { + size_now = get_reasonable_db_maxsize(&reasonable_maxsize); + if ((size_t)size_now / pagesize > MAX_PAGENO) + size_now = pagesize * MAX_PAGENO; + } + + if (size_upper <= 0) { + if (size_now >= get_reasonable_db_maxsize(&reasonable_maxsize) / 2) + size_upper = get_reasonable_db_maxsize(&reasonable_maxsize); + else if (MAX_MAPSIZE != MAX_MAPSIZE32 && + (size_t)size_now >= MAX_MAPSIZE32 / 2 && + (size_t)size_now <= MAX_MAPSIZE32 / 4 * 3) + size_upper = MAX_MAPSIZE32; + else { + size_upper = size_now + size_now; + if ((size_t)size_upper < DEFAULT_MAPSIZE * 2) + size_upper = DEFAULT_MAPSIZE * 2; + } + if ((size_t)size_upper / pagesize > MAX_PAGENO) + size_upper = pagesize * MAX_PAGENO; + } else if (size_upper >= INTPTR_MAX) { + size_upper = get_reasonable_db_maxsize(&reasonable_maxsize); + if ((size_t)size_upper / pagesize > MAX_PAGENO) + size_upper = pagesize * MAX_PAGENO; + } + + if (unlikely(size_lower < (intptr_t)MIN_MAPSIZE || size_lower > size_upper)) { + rc = MDBX_EINVAL; + goto bailout; + } + + if ((uint64_t)size_lower / pagesize < MIN_PAGENO) { + rc = MDBX_EINVAL; + goto bailout; + } + + if (unlikely((size_t)size_upper > MAX_MAPSIZE || + (uint64_t)size_upper / pagesize > MAX_PAGENO)) { + rc = MDBX_TOO_LARGE; + goto bailout; + } + + const size_t unit = (env->me_os_psize > (size_t)pagesize) ? 
env->me_os_psize + : (size_t)pagesize; + size_lower = ceil_powerof2(size_lower, unit); + size_upper = ceil_powerof2(size_upper, unit); + size_now = ceil_powerof2(size_now, unit); + + /* LY: pick a suitable size_upper value: + * - a multiple of the page size + * - without violating MAX_MAPSIZE and MAX_PAGENO */ + while (unlikely((size_t)size_upper > MAX_MAPSIZE || + (uint64_t)size_upper / pagesize > MAX_PAGENO)) { + if ((size_t)size_upper < unit + MIN_MAPSIZE || + (size_t)size_upper < (size_t)pagesize * (MIN_PAGENO + 1)) { + /* paranoia in case of overflow with improbable values */ + rc = MDBX_EINVAL; + goto bailout; + } + size_upper -= unit; + if ((size_t)size_upper < (size_t)size_lower) + size_lower = size_upper; + } + mdbx_assert(env, (size_upper - size_lower) % env->me_os_psize == 0); + + if (size_now < size_lower) + size_now = size_lower; + if (size_now > size_upper) + size_now = size_upper; + + if (growth_step < 0) { + growth_step = ((size_t)(size_upper - size_lower)) / 42; + if (growth_step > size_lower && size_lower < (intptr_t)MEGABYTE) + growth_step = size_lower; + if (growth_step < 65536) + growth_step = 65536; + if ((size_t)growth_step > MAX_MAPSIZE / 64) + growth_step = MAX_MAPSIZE / 64; + } + if (growth_step == 0 && shrink_threshold > 0) + growth_step = 1; + growth_step = ceil_powerof2(growth_step, unit); + + if (shrink_threshold < 0) + shrink_threshold = growth_step + growth_step; + shrink_threshold = ceil_powerof2(shrink_threshold, unit); + + //---------------------------------------------------------------------------- + + if (!env->me_map) { + /* save user's geo-params for future open/create */ + if (pagesize != (intptr_t)env->me_psize) + mdbx_setup_pagesize(env, pagesize); + env->me_dbgeo.lower = size_lower; + env->me_dbgeo.now = size_now; + env->me_dbgeo.upper = size_upper; + env->me_dbgeo.grow = + pgno2bytes(env, pv2pages(pages2pv(bytes2pgno(env, growth_step)))); + env->me_dbgeo.shrink = + pgno2bytes(env, pv2pages(pages2pv(bytes2pgno(env, shrink_threshold)))); + + mdbx_ensure(env, env->me_dbgeo.lower >= MIN_MAPSIZE); + mdbx_ensure(env, env->me_dbgeo.lower / (unsigned)pagesize >= MIN_PAGENO); + mdbx_ensure(env, env->me_dbgeo.lower % (unsigned)pagesize == 0); + mdbx_ensure(env, env->me_dbgeo.lower % env->me_os_psize == 0); + + mdbx_ensure(env, env->me_dbgeo.upper <= MAX_MAPSIZE); + mdbx_ensure(env, env->me_dbgeo.upper / (unsigned)pagesize <= MAX_PAGENO); + mdbx_ensure(env, env->me_dbgeo.upper % (unsigned)pagesize == 0); + mdbx_ensure(env, env->me_dbgeo.upper % env->me_os_psize == 0); + + mdbx_ensure(env, env->me_dbgeo.now >= env->me_dbgeo.lower); + mdbx_ensure(env, env->me_dbgeo.now <= env->me_dbgeo.upper); + mdbx_ensure(env, env->me_dbgeo.now % (unsigned)pagesize == 0); + mdbx_ensure(env, env->me_dbgeo.now % env->me_os_psize == 0); + + mdbx_ensure(env, env->me_dbgeo.grow % (unsigned)pagesize == 0); + mdbx_ensure(env, env->me_dbgeo.grow % env->me_os_psize == 0); + mdbx_ensure(env, env->me_dbgeo.shrink % (unsigned)pagesize == 0); + mdbx_ensure(env, env->me_dbgeo.shrink % env->me_os_psize == 0); + + rc = MDBX_SUCCESS; + } else { + /* apply new params to opened environment */ + mdbx_ensure(env, pagesize == (intptr_t)env->me_psize); + MDBX_meta meta; + MDBX_meta *head = nullptr; + const MDBX_geo *current_geo; + if (inside_txn) { + current_geo = &env->me_txn->mt_geo; + } else { + head = mdbx_meta_head(env); + meta = *head; + current_geo = &meta.mm_geo; + } + + MDBX_geo new_geo; + new_geo.lower = bytes2pgno(env, size_lower); + new_geo.now = bytes2pgno(env, size_now); + new_geo.upper 
= bytes2pgno(env, size_upper); + new_geo.grow_pv = pages2pv(bytes2pgno(env, growth_step)); + new_geo.shrink_pv = pages2pv(bytes2pgno(env, shrink_threshold)); + new_geo.next = current_geo->next; + + mdbx_ensure(env, + pgno_align2os_bytes(env, new_geo.lower) == (size_t)size_lower); + mdbx_ensure(env, + pgno_align2os_bytes(env, new_geo.upper) == (size_t)size_upper); + mdbx_ensure(env, pgno_align2os_bytes(env, new_geo.now) == (size_t)size_now); + mdbx_ensure(env, new_geo.grow_pv == pages2pv(pv2pages(new_geo.grow_pv))); + mdbx_ensure(env, + new_geo.shrink_pv == pages2pv(pv2pages(new_geo.shrink_pv))); + + mdbx_ensure(env, (size_t)size_lower >= MIN_MAPSIZE); + mdbx_ensure(env, new_geo.lower >= MIN_PAGENO); + mdbx_ensure(env, (size_t)size_upper <= MAX_MAPSIZE); + mdbx_ensure(env, new_geo.upper <= MAX_PAGENO); + mdbx_ensure(env, new_geo.now >= new_geo.next); + mdbx_ensure(env, new_geo.upper >= new_geo.now); + mdbx_ensure(env, new_geo.now >= new_geo.lower); + + if (memcmp(current_geo, &new_geo, sizeof(MDBX_geo)) != 0) { +#if defined(_WIN32) || defined(_WIN64) + /* Was DB shrinking disabled before, and will it be enabled now? */ + if (new_geo.lower < new_geo.upper && new_geo.shrink_pv && + !(current_geo->lower < current_geo->upper && + current_geo->shrink_pv)) { + if (!env->me_lck_mmap.lck) { + rc = MDBX_EPERM; + goto bailout; + } + int err = mdbx_rdt_lock(env); + if (unlikely(MDBX_IS_ERROR(err))) { + rc = err; + goto bailout; + } + + /* Check if there are any reading threads that do not use the SRWL */ + const size_t CurrentTid = GetCurrentThreadId(); + const MDBX_reader *const begin = env->me_lck_mmap.lck->mti_readers; + const MDBX_reader *const end = + begin + atomic_load32(&env->me_lck_mmap.lck->mti_numreaders, + mo_AcquireRelease); + for (const MDBX_reader *reader = begin; reader < end; ++reader) { + if (reader->mr_pid.weak == env->me_pid && reader->mr_tid.weak && + reader->mr_tid.weak != CurrentTid) { + /* At least one thread may not be using the SRWL */ + rc = MDBX_EPERM; + break; + } + } + + mdbx_rdt_unlock(env); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } +#endif + + if (new_geo.now != current_geo->now || + new_geo.upper != current_geo->upper) { + rc = mdbx_mapresize(env, current_geo->next, new_geo.now, new_geo.upper, + false); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + mdbx_assert(env, (head == nullptr) == inside_txn); + if (head) + head = /* base address could be changed */ mdbx_meta_head(env); + } + if (inside_txn) { + env->me_txn->mt_geo = new_geo; + env->me_txn->mt_flags |= MDBX_TXN_DIRTY; + } else { + meta.mm_geo = new_geo; + const txnid_t txnid = + safe64_txnid_next(mdbx_meta_txnid_stable(env, head)); + if (unlikely(txnid > MAX_TXNID)) { + rc = MDBX_TXN_FULL; + mdbx_error("txnid overflow, raise %d", rc); + } else { + mdbx_meta_set_txnid(env, &meta, txnid); + rc = mdbx_sync_locked(env, env->me_flags, &meta); + } + } + + if (likely(rc == MDBX_SUCCESS)) { + /* store new geo to env to avoid influences */ + env->me_dbgeo.now = pgno2bytes(env, new_geo.now); + env->me_dbgeo.lower = pgno2bytes(env, new_geo.lower); + env->me_dbgeo.upper = pgno2bytes(env, new_geo.upper); + env->me_dbgeo.grow = pgno2bytes(env, pv2pages(new_geo.grow_pv)); + env->me_dbgeo.shrink = pgno2bytes(env, pv2pages(new_geo.shrink_pv)); + } + } + } + +bailout: + if (need_unlock) + mdbx_txn_unlock(env); + return rc; +} + +#ifndef LIBMDBX_NO_EXPORTS_LEGACY_API +__cold int mdbx_env_set_mapsize(MDBX_env *env, size_t size) { + return __inline_mdbx_env_set_mapsize(env, size); +} + +__cold int 
mdbx_env_set_maxdbs(MDBX_env *env, MDBX_dbi dbs) { + return __inline_mdbx_env_set_maxdbs(env, dbs); +} + +__cold int mdbx_env_get_maxdbs(const MDBX_env *env, MDBX_dbi *dbs) { + return __inline_mdbx_env_get_maxdbs(env, dbs); +} + +__cold int mdbx_env_set_maxreaders(MDBX_env *env, unsigned readers) { + return __inline_mdbx_env_set_maxreaders(env, readers); +} + +__cold int mdbx_env_get_maxreaders(const MDBX_env *env, unsigned *readers) { + return __inline_mdbx_env_get_maxreaders(env, readers); +} +#endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */ + +/* Further setup required for opening an MDBX environment */ +static __cold int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, + const mdbx_mode_t mode_bits) { + uint64_t filesize_before; + MDBX_meta meta; + int rc = MDBX_RESULT_FALSE; + int err = mdbx_read_header(env, &meta, &filesize_before, lck_rc, mode_bits); + if (unlikely(err != MDBX_SUCCESS)) { + if (lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE || err != MDBX_ENODATA || + (env->me_flags & MDBX_RDONLY) != 0 || + /* recovery mode */ env->me_stuck_meta >= 0) + return err; + + mdbx_debug("%s", "create new database"); + rc = /* new database */ MDBX_RESULT_TRUE; + + if (!env->me_dbgeo.now) { + /* set defaults if not configured */ + err = mdbx_env_set_geometry(env, 0, -1, DEFAULT_MAPSIZE, -1, -1, -1); + if (unlikely(err != MDBX_SUCCESS)) + return err; + } + + void *buffer = mdbx_calloc(NUM_METAS, env->me_psize); + if (!buffer) + return MDBX_ENOMEM; + + meta = *mdbx_init_metas(env, buffer); + err = mdbx_pwrite(env->me_lazy_fd, buffer, env->me_psize * NUM_METAS, 0); + mdbx_free(buffer); + if (unlikely(err != MDBX_SUCCESS)) + return err; + + err = mdbx_ftruncate(env->me_lazy_fd, filesize_before = env->me_dbgeo.now); + if (unlikely(err != MDBX_SUCCESS)) + return err; + +#ifndef NDEBUG /* just for checking */ + err = mdbx_read_header(env, &meta, &filesize_before, lck_rc, mode_bits); + if (unlikely(err != MDBX_SUCCESS)) + return err; +#endif + } + + mdbx_verbose( + "header: root %" PRIaPGNO "/%" PRIaPGNO ", geo %" PRIaPGNO "/%" PRIaPGNO + "-%" PRIaPGNO "/%" PRIaPGNO " +%u -%u, txn_id %" PRIaTXN ", %s", + meta.mm_dbs[MAIN_DBI].md_root, meta.mm_dbs[FREE_DBI].md_root, + meta.mm_geo.lower, meta.mm_geo.next, meta.mm_geo.now, meta.mm_geo.upper, + pv2pages(meta.mm_geo.grow_pv), pv2pages(meta.mm_geo.shrink_pv), + unaligned_peek_u64(4, meta.mm_txnid_a), mdbx_durable_str(&meta)); + + mdbx_setup_pagesize(env, meta.mm_psize); + const size_t used_bytes = pgno2bytes(env, meta.mm_geo.next); + const size_t used_aligned2os_bytes = + ceil_powerof2(used_bytes, env->me_os_psize); + if ((env->me_flags & MDBX_RDONLY) /* readonly */ + || lck_rc != MDBX_RESULT_TRUE /* not exclusive */ + || /* recovery mode */ env->me_stuck_meta >= 0) { + /* use present params from db */ + const size_t pagesize = meta.mm_psize; + err = mdbx_env_set_geometry( + env, meta.mm_geo.lower * pagesize, meta.mm_geo.now * pagesize, + meta.mm_geo.upper * pagesize, pv2pages(meta.mm_geo.grow_pv) * pagesize, + pv2pages(meta.mm_geo.shrink_pv) * pagesize, meta.mm_psize); + if (unlikely(err != MDBX_SUCCESS)) { + mdbx_error("%s: err %d", "could not apply preconfigured geometry from db", + err); + return (err == MDBX_EINVAL) ? 
MDBX_INCOMPATIBLE : err; + } + } else if (env->me_dbgeo.now) { + /* silently grow to the last used page */ + if (env->me_dbgeo.now < used_aligned2os_bytes) + env->me_dbgeo.now = used_aligned2os_bytes; + if (env->me_dbgeo.upper < used_aligned2os_bytes) + env->me_dbgeo.upper = used_aligned2os_bytes; + + /* apply preconfigured params, but only if there are substantial changes: + * - upper or lower limit changes + * - shrink threshold or growth step + * But ignore a change of just the 'now/current' size. */ + if (bytes_align2os_bytes(env, env->me_dbgeo.upper) != + pgno2bytes(env, meta.mm_geo.upper) || + bytes_align2os_bytes(env, env->me_dbgeo.lower) != + pgno2bytes(env, meta.mm_geo.lower) || + bytes_align2os_bytes(env, env->me_dbgeo.shrink) != + pgno2bytes(env, pv2pages(meta.mm_geo.shrink_pv)) || + bytes_align2os_bytes(env, env->me_dbgeo.grow) != + pgno2bytes(env, pv2pages(meta.mm_geo.grow_pv))) { + + if (env->me_dbgeo.shrink && env->me_dbgeo.now > used_bytes) + /* pre-shrink if enabled */ + env->me_dbgeo.now = used_bytes + env->me_dbgeo.shrink - + used_bytes % env->me_dbgeo.shrink; + + err = mdbx_env_set_geometry(env, env->me_dbgeo.lower, env->me_dbgeo.now, + env->me_dbgeo.upper, env->me_dbgeo.grow, + env->me_dbgeo.shrink, meta.mm_psize); + if (unlikely(err != MDBX_SUCCESS)) { + mdbx_error("%s: err %d", "could not apply preconfigured db-geometry", + err); + return (err == MDBX_EINVAL) ? MDBX_INCOMPATIBLE : err; + } + + /* update meta fields */ + meta.mm_geo.now = bytes2pgno(env, env->me_dbgeo.now); + meta.mm_geo.lower = bytes2pgno(env, env->me_dbgeo.lower); + meta.mm_geo.upper = bytes2pgno(env, env->me_dbgeo.upper); + meta.mm_geo.grow_pv = pages2pv(bytes2pgno(env, env->me_dbgeo.grow)); + meta.mm_geo.shrink_pv = pages2pv(bytes2pgno(env, env->me_dbgeo.shrink)); + + mdbx_verbose("amended: root %" PRIaPGNO "/%" PRIaPGNO ", geo %" PRIaPGNO + "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO + " +%u -%u, txn_id %" PRIaTXN ", %s", + meta.mm_dbs[MAIN_DBI].md_root, meta.mm_dbs[FREE_DBI].md_root, + meta.mm_geo.lower, meta.mm_geo.next, meta.mm_geo.now, + meta.mm_geo.upper, pv2pages(meta.mm_geo.grow_pv), + pv2pages(meta.mm_geo.shrink_pv), + unaligned_peek_u64(4, meta.mm_txnid_a), + mdbx_durable_str(&meta)); + } else { + /* fetch back the 'now/current' size, since it was ignored during the + * comparison and may differ. */ + env->me_dbgeo.now = pgno_align2os_bytes(env, meta.mm_geo.now); + } + mdbx_ensure(env, meta.mm_geo.now >= meta.mm_geo.next); + } else { + /* geo-params are not pre-configured by the user, + * so take the current values from the meta. 
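+ * (Each env->me_dbgeo field below is thus derived from meta.mm_geo,
+ * converted from pages back to bytes.)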
*/ + env->me_dbgeo.now = pgno2bytes(env, meta.mm_geo.now); + env->me_dbgeo.lower = pgno2bytes(env, meta.mm_geo.lower); + env->me_dbgeo.upper = pgno2bytes(env, meta.mm_geo.upper); + env->me_dbgeo.grow = pgno2bytes(env, pv2pages(meta.mm_geo.grow_pv)); + env->me_dbgeo.shrink = pgno2bytes(env, pv2pages(meta.mm_geo.shrink_pv)); + } + + mdbx_ensure(env, + pgno_align2os_bytes(env, meta.mm_geo.now) == env->me_dbgeo.now); + mdbx_ensure(env, env->me_dbgeo.now >= used_bytes); + if (unlikely(filesize_before != env->me_dbgeo.now)) { + if (lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE) { + mdbx_verbose("filesize mismatch (expect %" PRIuPTR "b/%" PRIaPGNO + "p, have %" PRIu64 "b/%" PRIaPGNO "p), " + "assume other process working", + env->me_dbgeo.now, bytes2pgno(env, env->me_dbgeo.now), + filesize_before, bytes2pgno(env, (size_t)filesize_before)); + } else { + mdbx_warning("filesize mismatch (expect %" PRIuSIZE "b/%" PRIaPGNO + "p, have %" PRIu64 "b/%" PRIaPGNO "p)", + env->me_dbgeo.now, bytes2pgno(env, env->me_dbgeo.now), + filesize_before, bytes2pgno(env, (size_t)filesize_before)); + if (filesize_before < used_bytes) { + mdbx_error("last-page beyond end-of-file (last %" PRIaPGNO + ", have %" PRIaPGNO ")", + meta.mm_geo.next, bytes2pgno(env, (size_t)filesize_before)); + return MDBX_CORRUPTED; + } + + if (env->me_flags & MDBX_RDONLY) { + if (filesize_before & (env->me_os_psize - 1)) { + mdbx_error("%s", "filesize should be rounded-up to system page"); + return MDBX_WANNA_RECOVERY; + } + mdbx_warning("%s", "ignore filesize mismatch in readonly-mode"); + } else { + mdbx_verbose("will resize datafile to %" PRIuSIZE " bytes, %" PRIaPGNO + " pages", + env->me_dbgeo.now, bytes2pgno(env, env->me_dbgeo.now)); + } + } + } + + mdbx_verbose("current boot-id %" PRIx64 "-%" PRIx64 " (%savailable)", + bootid.x, bootid.y, (bootid.x | bootid.y) ? "" : "not-"); + +#if MDBX_ENABLE_MADVISE + /* calculate readahead hint before mmap with zero redundant pages */ + const bool readahead = + !(env->me_flags & MDBX_NORDAHEAD) && + mdbx_is_readahead_reasonable(used_bytes, 0) == MDBX_RESULT_TRUE; +#endif /* MDBX_ENABLE_MADVISE */ + + err = mdbx_mmap(env->me_flags, &env->me_dxb_mmap, env->me_dbgeo.now, + env->me_dbgeo.upper, lck_rc ? MMAP_OPTION_TRUNCATE : 0); + if (unlikely(err != MDBX_SUCCESS)) + return err; + +#if MDBX_ENABLE_MADVISE +#if defined(MADV_DONTDUMP) + err = madvise(env->me_map, env->me_dxb_mmap.limit, MADV_DONTDUMP) + ? ignore_enosys(errno) + : MDBX_SUCCESS; + if (unlikely(MDBX_IS_ERROR(err))) + return err; +#endif /* MADV_DONTDUMP */ +#if defined(MADV_DODUMP) + if (mdbx_runtime_flags & MDBX_DBG_DUMP) { + const size_t meta_length_aligned2os = pgno_align2os_bytes(env, NUM_METAS); + err = madvise(env->me_map, meta_length_aligned2os, MADV_DODUMP) + ? 
ignore_enosys(errno) + : MDBX_SUCCESS; + if (unlikely(MDBX_IS_ERROR(err))) + return err; + } +#endif /* MADV_DODUMP */ +#endif /* MDBX_ENABLE_MADVISE */ + +#ifdef MDBX_USE_VALGRIND + env->me_valgrind_handle = + VALGRIND_CREATE_BLOCK(env->me_map, env->me_dxb_mmap.limit, "mdbx"); +#endif /* MDBX_USE_VALGRIND */ + + mdbx_assert(env, used_bytes >= pgno2bytes(env, NUM_METAS) && + used_bytes <= env->me_dxb_mmap.limit); +#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) + VALGRIND_MAKE_MEM_NOACCESS(env->me_map + used_bytes, + env->me_dxb_mmap.limit - used_bytes); + ASAN_POISON_MEMORY_REGION(env->me_map + used_bytes, + env->me_dxb_mmap.limit - used_bytes); + env->me_poison_edge = bytes2pgno(env, env->me_dxb_mmap.limit); +#endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */ + + while (likely(/* not recovery mode */ env->me_stuck_meta < 0)) { + const unsigned meta_clash_mask = mdbx_meta_eq_mask(env); + if (unlikely(meta_clash_mask)) { + if (/* not recovery mode */ env->me_stuck_meta < 0) { + mdbx_error("meta-pages clash: mask 0x%X", meta_clash_mask); + return MDBX_CORRUPTED; + } else { + mdbx_warning("ignoring meta-pages clash (mask 0x%X) in recovery mode", + meta_clash_mask); + } + } + + MDBX_meta *const head = mdbx_meta_head(env); + const txnid_t head_txnid = mdbx_meta_txnid_fluid(env, head); + MDBX_meta *const steady = mdbx_meta_steady(env); + const txnid_t steady_txnid = mdbx_meta_txnid_fluid(env, steady); + if (head_txnid == steady_txnid) + break; + + if (lck_rc == /* lck exclusive */ MDBX_RESULT_TRUE) { + mdbx_assert(env, META_IS_STEADY(steady) && !META_IS_STEADY(head)); + if (meta_bootid_match(head)) { + MDBX_meta clone = *head; + uint64_t filesize = env->me_dbgeo.now; + err = mdbx_validate_meta( + env, &clone, &filesize, data_page(head), + bytes2pgno(env, (uint8_t *)data_page(head) - env->me_map), nullptr, + env->me_psize); + if (err == MDBX_SUCCESS) { + mdbx_warning( + "opening after an unclean shutdown, but boot-id(%016" PRIx64 + "-%016" PRIx64 + ") is MATCH: rollback NOT needed, steady-sync NEEDED%s", + bootid.x, bootid.y, + (env->me_flags & MDBX_RDONLY) ? 
", but unable in read-only mode" + : ""); + if (env->me_flags & MDBX_RDONLY) + return MDBX_WANNA_RECOVERY /* LY: could not recover/sync */; + meta = clone; + atomic_store32(&env->me_lck->mti_unsynced_pages, meta.mm_geo.next, + mo_Relaxed); + break; + } + mdbx_warning("opening after an unclean shutdown, " + "but boot-id(%016" PRIx64 "-%016" PRIx64 ") is MATCH, " + "but the last meta is not valid, rollback needed", + bootid.x, bootid.y); + } + if (env->me_flags & MDBX_RDONLY) { + mdbx_error("rollback needed: (from head %" PRIaTXN + " to steady %" PRIaTXN "), but unable in read-only mode", + head_txnid, steady_txnid); + return MDBX_WANNA_RECOVERY /* LY: could not recover/rollback */; + } + + const MDBX_meta *const meta0 = METAPAGE(env, 0); + const MDBX_meta *const meta1 = METAPAGE(env, 1); + const MDBX_meta *const meta2 = METAPAGE(env, 2); + txnid_t undo_txnid = 0 /* zero means undo is unneeded */; + while ( + (head != meta0 && mdbx_meta_txnid_fluid(env, meta0) == undo_txnid) || + (head != meta1 && mdbx_meta_txnid_fluid(env, meta1) == undo_txnid) || + (head != meta2 && mdbx_meta_txnid_fluid(env, meta2) == undo_txnid)) + undo_txnid = safe64_txnid_next(undo_txnid); + if (unlikely(undo_txnid >= steady_txnid)) { + mdbx_fatal("rollback failed: no suitable txnid (0,1,2) < %" PRIaTXN, + steady_txnid); + return MDBX_PANIC /* LY: could not recover/rollback */; + } + + /* LY: rollback weak checkpoint */ + mdbx_notice("rollback: from %" PRIaTXN ", to %" PRIaTXN " as %" PRIaTXN, + head_txnid, steady_txnid, undo_txnid); + mdbx_ensure(env, head_txnid == mdbx_meta_txnid_stable(env, head)); + +#if MDBX_ENABLE_PGOP_STAT + safe64_inc(&env->me_lck->mti_pgop_stat.wops, 1); +#endif /* MDBX_ENABLE_PGOP_STAT */ + if (env->me_flags & MDBX_WRITEMAP) { + /* It is possible to update txnid without safe64_write(), + * since the DB is opened exclusively for now */ + unaligned_poke_u64(4, head->mm_txnid_a, undo_txnid); + unaligned_poke_u64(4, head->mm_datasync_sign, MDBX_DATASIGN_WEAK); + unaligned_poke_u64(4, head->mm_txnid_b, undo_txnid); + const size_t offset = (uint8_t *)data_page(head) - env->me_dxb_mmap.dxb; + const size_t paged_offset = floor_powerof2(offset, env->me_os_psize); + const size_t paged_length = ceil_powerof2( + env->me_psize + offset - paged_offset, env->me_os_psize); + err = mdbx_msync(&env->me_dxb_mmap, paged_offset, paged_length, + MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + } else { + MDBX_meta rollback = *head; + mdbx_meta_set_txnid(env, &rollback, undo_txnid); + unaligned_poke_u64(4, rollback.mm_datasync_sign, MDBX_DATASIGN_WEAK); + const mdbx_filehandle_t fd = (env->me_dsync_fd != INVALID_HANDLE_VALUE) + ? env->me_dsync_fd + : env->me_lazy_fd; + err = mdbx_pwrite(fd, &rollback, sizeof(MDBX_meta), + (uint8_t *)head - (uint8_t *)env->me_map); + if (err == MDBX_SUCCESS && fd == env->me_lazy_fd) + err = mdbx_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + } + if (err) { + mdbx_error("error %d rollback from %" PRIaTXN ", to %" PRIaTXN + " as %" PRIaTXN, + err, head_txnid, steady_txnid, undo_txnid); + return err; + } + + mdbx_flush_incoherent_mmap(env->me_map, pgno2bytes(env, NUM_METAS), + env->me_os_psize); + mdbx_ensure(env, undo_txnid == mdbx_meta_txnid_fluid(env, head)); + mdbx_ensure(env, 0 == mdbx_meta_eq_mask(env)); + continue; + } + + if (!env->me_lck_mmap.lck) { + /* LY: without-lck (read-only) mode, so it is impossible that another + * process made a weak checkpoint. 
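+ * Hence the head/steady mismatch cannot be repaired here, and the
+ * only safe answer for the caller is MDBX_WANNA_RECOVERY just below.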
*/ + mdbx_error("%s", "without-lck, unable to recover/rollback"); + return MDBX_WANNA_RECOVERY; + } + + /* LY: assume we just have a collision with another running process, + * or someone made a weak checkpoint */ + mdbx_verbose("%s", "assume collision or online weak checkpoint"); + break; + } + + const MDBX_meta *head = mdbx_meta_head(env); + if (lck_rc == /* lck exclusive */ MDBX_RESULT_TRUE) { + /* re-check size after mmap */ + if ((env->me_dxb_mmap.current & (env->me_os_psize - 1)) != 0 || + env->me_dxb_mmap.current < used_bytes) { + mdbx_error("unacceptable/unexpected datafile size %" PRIuPTR, + env->me_dxb_mmap.current); + return MDBX_PROBLEM; + } + if (env->me_dxb_mmap.current != env->me_dbgeo.now) { + meta.mm_geo.now = bytes2pgno(env, env->me_dxb_mmap.current); + mdbx_notice("need to update meta-geo to filesize %" PRIuPTR + " bytes, %" PRIaPGNO " pages", + env->me_dxb_mmap.current, meta.mm_geo.now); + } + + if (memcmp(&meta.mm_geo, &head->mm_geo, sizeof(meta.mm_geo))) { + if ((env->me_flags & MDBX_RDONLY) != 0 || + /* recovery mode */ env->me_stuck_meta >= 0) { + mdbx_warning( + "skipped updating meta.geo in %s mode: from l%" PRIaPGNO + "-n%" PRIaPGNO "-u%" PRIaPGNO "/s%u-g%u, to l%" PRIaPGNO + "-n%" PRIaPGNO "-u%" PRIaPGNO "/s%u-g%u", + (env->me_stuck_meta < 0) ? "read-only" : "recovery", + head->mm_geo.lower, head->mm_geo.now, head->mm_geo.upper, + pv2pages(head->mm_geo.shrink_pv), pv2pages(head->mm_geo.grow_pv), + meta.mm_geo.lower, meta.mm_geo.now, meta.mm_geo.upper, + pv2pages(meta.mm_geo.shrink_pv), pv2pages(meta.mm_geo.grow_pv)); + } else { + const txnid_t txnid = mdbx_meta_txnid_stable(env, head); + const txnid_t next_txnid = safe64_txnid_next(txnid); + if (unlikely(txnid > MAX_TXNID)) { + mdbx_error("txnid overflow, raise %d", MDBX_TXN_FULL); + return MDBX_TXN_FULL; + } + mdbx_notice("updating meta.geo: " + "from l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO + "/s%u-g%u (txn#%" PRIaTXN "), " + "to l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO + "/s%u-g%u (txn#%" PRIaTXN ")", + head->mm_geo.lower, head->mm_geo.now, head->mm_geo.upper, + pv2pages(head->mm_geo.shrink_pv), + pv2pages(head->mm_geo.grow_pv), txnid, meta.mm_geo.lower, + meta.mm_geo.now, meta.mm_geo.upper, + pv2pages(meta.mm_geo.shrink_pv), + pv2pages(meta.mm_geo.grow_pv), next_txnid); + + mdbx_ensure(env, mdbx_meta_eq(env, &meta, head)); + mdbx_meta_set_txnid(env, &meta, next_txnid); + err = mdbx_sync_locked(env, env->me_flags | MDBX_SHRINK_ALLOWED, &meta); + if (err) { + mdbx_error("error %d, while updating meta.geo: " + "from l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO + "/s%u-g%u (txn#%" PRIaTXN "), " + "to l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO + "/s%u-g%u (txn#%" PRIaTXN ")", + err, head->mm_geo.lower, head->mm_geo.now, + head->mm_geo.upper, pv2pages(head->mm_geo.shrink_pv), + pv2pages(head->mm_geo.grow_pv), txnid, meta.mm_geo.lower, + meta.mm_geo.now, meta.mm_geo.upper, + pv2pages(meta.mm_geo.shrink_pv), + pv2pages(meta.mm_geo.grow_pv), next_txnid); + return err; + } + } + } + } + + atomic_store32(&env->me_lck->mti_discarded_tail, + bytes2pgno(env, used_aligned2os_bytes), mo_Relaxed); +#if MDBX_ENABLE_MADVISE + if (lck_rc && used_aligned2os_bytes < env->me_dxb_mmap.current) { +#if defined(MADV_REMOVE) + if ((env->me_flags & MDBX_WRITEMAP) != 0 && + /* not recovery mode */ env->me_stuck_meta < 0) { + mdbx_notice("open-MADV_%s %u..%u", "REMOVE (deallocate file space)", + env->me_lck->mti_discarded_tail.weak, + bytes2pgno(env, env->me_dxb_mmap.current)); + err = + madvise(env->me_map + used_aligned2os_bytes, + 
env->me_dxb_mmap.current - used_aligned2os_bytes, MADV_REMOVE) + ? ignore_enosys(errno) + : MDBX_SUCCESS; + if (unlikely(MDBX_IS_ERROR(err))) + return err; + } +#endif /* MADV_REMOVE */ +#if defined(MADV_DONTNEED) + mdbx_notice("open-MADV_%s %u..%u", "DONTNEED", + env->me_lck->mti_discarded_tail.weak, + bytes2pgno(env, env->me_dxb_mmap.current)); + err = + madvise(env->me_map + used_aligned2os_bytes, + env->me_dxb_mmap.current - used_aligned2os_bytes, MADV_DONTNEED) + ? ignore_enosys(errno) + : MDBX_SUCCESS; + if (unlikely(MDBX_IS_ERROR(err))) + return err; +#elif defined(POSIX_MADV_DONTNEED) + err = ignore_enosys(posix_madvise( + env->me_map + used_aligned2os_bytes, + env->me_dxb_mmap.current - used_aligned2os_bytes, POSIX_MADV_DONTNEED)); + if (unlikely(MDBX_IS_ERROR(err))) + return err; +#elif defined(POSIX_FADV_DONTNEED) + err = ignore_enosys(posix_fadvise( + env->me_lazy_fd, used_aligned2os_bytes, + env->me_dxb_mmap.current - used_aligned2os_bytes, POSIX_FADV_DONTNEED)); + if (unlikely(MDBX_IS_ERROR(err))) + return err; +#endif /* MADV_DONTNEED */ + } + + err = mdbx_set_readahead(env, bytes2pgno(env, used_bytes), readahead, true); + if (unlikely(err != MDBX_SUCCESS)) + return err; +#endif /* MDBX_ENABLE_MADVISE */ + + return rc; +} + +/******************************************************************************/ + +/* Open and/or initialize the lock region for the environment. */ +static __cold int mdbx_setup_lck(MDBX_env *env, char *lck_pathname, + mdbx_mode_t mode) { + mdbx_assert(env, env->me_lazy_fd != INVALID_HANDLE_VALUE); + mdbx_assert(env, env->me_lfd == INVALID_HANDLE_VALUE); + + int err = mdbx_openfile(MDBX_OPEN_LCK, env, lck_pathname, &env->me_lfd, mode); + if (err != MDBX_SUCCESS) { + if (!(err == MDBX_ENOFILE && (env->me_flags & MDBX_EXCLUSIVE)) && + !((err == MDBX_EROFS || err == MDBX_EACCESS || err == MDBX_EPERM) && + (env->me_flags & MDBX_RDONLY))) + return err; + + /* ensure the file system is read-only */ + err = mdbx_check_fs_rdonly(env->me_lazy_fd, lck_pathname, err); + if (err != MDBX_SUCCESS && + /* ignore ERROR_NOT_SUPPORTED for exclusive mode */ + !(err == MDBX_ENOSYS && (env->me_flags & MDBX_EXCLUSIVE))) + return err; + + /* LY: without-lck mode (e.g. exclusive or on read-only filesystem) */ + /* beginning of a locked section ---------------------------------------- */ + lcklist_lock(); + mdbx_assert(env, env->me_lcklist_next == nullptr); + env->me_lfd = INVALID_HANDLE_VALUE; + const int rc = mdbx_lck_seize(env); + if (MDBX_IS_ERROR(rc)) { + /* Calling lcklist_detach_locked() is required to restore POSIX-filelock + * and this job will be done by mdbx_env_close0(). */ + lcklist_unlock(); + return rc; + } + /* insert into inprocess lck-list */ + env->me_lcklist_next = inprocess_lcklist_head; + inprocess_lcklist_head = env; + lcklist_unlock(); + /* end of a locked section ---------------------------------------------- */ + + env->me_lck = lckless_stub(env); + env->me_maxreaders = UINT_MAX; + mdbx_debug("lck-setup:%s%s%s", " lck-less", + (env->me_flags & MDBX_RDONLY) ? " readonly" : "", + (rc == MDBX_RESULT_TRUE) ? " exclusive" : " cooperative"); + return rc; + } + + /* beginning of a locked section ------------------------------------------ */ + lcklist_lock(); + mdbx_assert(env, env->me_lcklist_next == nullptr); + + /* Try to get exclusive lock. If we succeed, then + * nobody is using the lock region and we should initialize it. 
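+ * Otherwise the region is shared with other processes, so instead of
+ * re-initializing it, its magic/format fields are validated below.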
*/ + err = mdbx_lck_seize(env); + if (MDBX_IS_ERROR(err)) { + bailout: + /* Calling lcklist_detach_locked() is required to restore POSIX-filelock + * and this job will be done by mdbx_env_close0(). */ + lcklist_unlock(); + return err; + } + + MDBX_env *inprocess_neighbor = nullptr; + if (err == MDBX_RESULT_TRUE) { + err = uniq_check(&env->me_lck_mmap, &inprocess_neighbor); + if (MDBX_IS_ERROR(err)) + goto bailout; + if (inprocess_neighbor && + ((mdbx_runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN) == 0 || + (inprocess_neighbor->me_flags & MDBX_EXCLUSIVE) != 0)) { + err = MDBX_BUSY; + goto bailout; + } + } + const int lck_seize_rc = err; + + mdbx_debug("lck-setup:%s%s%s", " with-lck", + (env->me_flags & MDBX_RDONLY) ? " readonly" : "", + (lck_seize_rc == MDBX_RESULT_TRUE) ? " exclusive" + : " cooperative"); + + uint64_t size = 0; + err = mdbx_filesize(env->me_lfd, &size); + if (unlikely(err != MDBX_SUCCESS)) + goto bailout; + + if (lck_seize_rc == MDBX_RESULT_TRUE) { + size = ceil_powerof2(env->me_maxreaders * sizeof(MDBX_reader) + + sizeof(MDBX_lockinfo), + env->me_os_psize); + mdbx_jitter4testing(false); + } else { + if (env->me_flags & MDBX_EXCLUSIVE) { + err = MDBX_BUSY; + goto bailout; + } + if (size > INT_MAX || (size & (env->me_os_psize - 1)) != 0 || + size < env->me_os_psize) { + mdbx_error("lck-file has invalid size %" PRIu64 " bytes", size); + err = MDBX_PROBLEM; + goto bailout; + } + } + + const size_t maxreaders = + ((size_t)size - sizeof(MDBX_lockinfo)) / sizeof(MDBX_reader); + if (maxreaders < 4) { + mdbx_error("lck-size too small (up to %" PRIuPTR " readers)", maxreaders); + err = MDBX_PROBLEM; + goto bailout; + } + env->me_maxreaders = (maxreaders <= MDBX_READERS_LIMIT) + ? (unsigned)maxreaders + : (unsigned)MDBX_READERS_LIMIT; + + err = mdbx_mmap((env->me_flags & MDBX_EXCLUSIVE) | MDBX_WRITEMAP, + &env->me_lck_mmap, (size_t)size, (size_t)size, + lck_seize_rc ? MMAP_OPTION_TRUNCATE | MMAP_OPTION_SEMAPHORE + : MMAP_OPTION_SEMAPHORE); + if (unlikely(err != MDBX_SUCCESS)) + goto bailout; + +#if MDBX_ENABLE_MADVISE +#ifdef MADV_DODUMP + err = madvise(env->me_lck_mmap.lck, size, MADV_DODUMP) ? ignore_enosys(errno) + : MDBX_SUCCESS; + if (unlikely(MDBX_IS_ERROR(err))) + goto bailout; +#endif /* MADV_DODUMP */ + +#ifdef MADV_WILLNEED + err = madvise(env->me_lck_mmap.lck, size, MADV_WILLNEED) + ? 
ignore_enosys(errno) + : MDBX_SUCCESS; + if (unlikely(MDBX_IS_ERROR(err))) + goto bailout; +#elif defined(POSIX_MADV_WILLNEED) + err = ignore_enosys( + posix_madvise(env->me_lck_mmap.lck, size, POSIX_MADV_WILLNEED)); + if (unlikely(MDBX_IS_ERROR(err))) + goto bailout; +#endif /* MADV_WILLNEED */ +#endif /* MDBX_ENABLE_MADVISE */ + + struct MDBX_lockinfo *const lck = env->me_lck_mmap.lck; + if (lck_seize_rc == MDBX_RESULT_TRUE) { + /* LY: exclusive mode, check and reset lck content */ + memset(lck, 0, (size_t)size); + mdbx_jitter4testing(false); + lck->mti_magic_and_version = MDBX_LOCK_MAGIC; + lck->mti_os_and_format = MDBX_LOCK_FORMAT; +#if MDBX_ENABLE_PGOP_STAT + lck->mti_pgop_stat.wops.weak = 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + err = mdbx_msync(&env->me_lck_mmap, 0, (size_t)size, MDBX_SYNC_NONE); + if (unlikely(err != MDBX_SUCCESS)) { + mdbx_error("initial-%s for lck-file failed", "msync"); + goto bailout; + } + err = mdbx_fsync(env->me_lck_mmap.fd, MDBX_SYNC_SIZE); + if (unlikely(err != MDBX_SUCCESS)) { + mdbx_error("initial-%s for lck-file failed", "fsync"); + goto bailout; + } + } else { + if (lck->mti_magic_and_version != MDBX_LOCK_MAGIC) { + mdbx_error("%s", "lock region has invalid magic/version"); + err = ((lck->mti_magic_and_version >> 8) != MDBX_MAGIC) + ? MDBX_INVALID + : MDBX_VERSION_MISMATCH; + goto bailout; + } + if (lck->mti_os_and_format != MDBX_LOCK_FORMAT) { + mdbx_error("lock region has os/format 0x%" PRIx32 ", expected 0x%" PRIx32, + lck->mti_os_and_format, MDBX_LOCK_FORMAT); + err = MDBX_VERSION_MISMATCH; + goto bailout; + } + } + + err = mdbx_lck_init(env, inprocess_neighbor, lck_seize_rc); + if (MDBX_IS_ERROR(err)) + goto bailout; + + mdbx_ensure(env, env->me_lcklist_next == nullptr); + /* insert into inprocess lck-list */ + env->me_lcklist_next = inprocess_lcklist_head; + inprocess_lcklist_head = env; + lcklist_unlock(); + /* end of a locked section ------------------------------------------------ */ + + mdbx_assert(env, !MDBX_IS_ERROR(lck_seize_rc)); + env->me_lck = lck; + return lck_seize_rc; +} + +__cold int mdbx_is_readahead_reasonable(size_t volume, intptr_t redundancy) { + if (volume <= 1024 * 1024 * 4ul) + return MDBX_RESULT_TRUE; + + intptr_t pagesize, total_ram_pages; + int err = mdbx_get_sysraminfo(&pagesize, &total_ram_pages, nullptr); + if (unlikely(err != MDBX_SUCCESS)) + return err; + + const int log2page = log2n_powerof2(pagesize); + const intptr_t volume_pages = (volume + pagesize - 1) >> log2page; + const intptr_t redundancy_pages = + (redundancy < 0) ? -(intptr_t)((-redundancy + pagesize - 1) >> log2page) + : (intptr_t)(redundancy + pagesize - 1) >> log2page; + if (volume_pages >= total_ram_pages || + volume_pages + redundancy_pages >= total_ram_pages) + return MDBX_RESULT_FALSE; + + intptr_t avail_ram_pages; + err = mdbx_get_sysraminfo(nullptr, nullptr, &avail_ram_pages); + if (unlikely(err != MDBX_SUCCESS)) + return err; + + return (volume_pages + redundancy_pages >= avail_ram_pages) + ? 
MDBX_RESULT_FALSE + : MDBX_RESULT_TRUE; +} + +/* Merge sync flags */ +static uint32_t merge_sync_flags(const uint32_t a, const uint32_t b) { + uint32_t r = a | b; + + /* avoid false MDBX_UTTERLY_NOSYNC */ + if (F_ISSET(r, MDBX_UTTERLY_NOSYNC) && !F_ISSET(a, MDBX_UTTERLY_NOSYNC) && + !F_ISSET(b, MDBX_UTTERLY_NOSYNC)) + r = (r - MDBX_UTTERLY_NOSYNC) | MDBX_SAFE_NOSYNC; + + /* convert MDBX_DEPRECATED_MAPASYNC to MDBX_SAFE_NOSYNC */ + if ((r & (MDBX_WRITEMAP | MDBX_DEPRECATED_MAPASYNC)) == + (MDBX_WRITEMAP | MDBX_DEPRECATED_MAPASYNC) && + !F_ISSET(r, MDBX_UTTERLY_NOSYNC)) + r = (r - MDBX_DEPRECATED_MAPASYNC) | MDBX_SAFE_NOSYNC; + + /* force MDBX_NOMETASYNC if MDBX_SAFE_NOSYNC enabled */ + if (r & MDBX_SAFE_NOSYNC) + r |= MDBX_NOMETASYNC; + + assert(!(F_ISSET(r, MDBX_UTTERLY_NOSYNC) && + !F_ISSET(a, MDBX_UTTERLY_NOSYNC) && + !F_ISSET(b, MDBX_UTTERLY_NOSYNC))); + return r; +} + +__cold int mdbx_env_turn_for_recovery(MDBX_env *env, unsigned target_meta) { + if (unlikely(target_meta >= NUM_METAS)) + return MDBX_EINVAL; + int rc = check_env(env, true); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely((env->me_flags & (MDBX_EXCLUSIVE | MDBX_RDONLY)) != + MDBX_EXCLUSIVE)) + return MDBX_EPERM; + + MDBX_page *page = + (env->me_flags & MDBX_WRITEMAP) + ? pgno2page(env, target_meta) + : memcpy(env->me_pbuf, pgno2page(env, target_meta), env->me_psize); + page->mp_pgno = target_meta; + page->mp_flags = P_META; + + MDBX_meta *meta = page_meta(page); + unaligned_poke_u64(4, meta->mm_magic_and_version, MDBX_DATA_MAGIC); + meta->mm_psize = env->me_psize; + txnid_t txnid = mdbx_meta_txnid_stable(env, meta); + const txnid_t txnid0 = mdbx_meta_txnid_stable(env, METAPAGE(env, 0)); + if (target_meta != 0 && txnid <= txnid0) + txnid = safe64_txnid_next(txnid0); + const txnid_t txnid1 = mdbx_meta_txnid_stable(env, METAPAGE(env, 1)); + if (target_meta != 1 && txnid <= txnid1) + txnid = safe64_txnid_next(txnid1); + const txnid_t txnid2 = mdbx_meta_txnid_stable(env, METAPAGE(env, 2)); + if (target_meta != 2 && txnid <= txnid2) + txnid = safe64_txnid_next(txnid2); + + if (!META_IS_STEADY(meta) || mdbx_recent_committed_txnid(env) != txnid) { + if (unlikely(txnid > MAX_TXNID)) { + mdbx_error("txnid overflow, raise %d", MDBX_TXN_FULL); + return MDBX_TXN_FULL; + } + mdbx_meta_set_txnid(env, meta, txnid); + unaligned_poke_u64(4, meta->mm_datasync_sign, mdbx_meta_sign(meta)); + } + +#if MDBX_ENABLE_PGOP_STAT + safe64_inc(&env->me_lck->mti_pgop_stat.wops, 1); +#endif /* MDBX_ENABLE_PGOP_STAT */ + if (env->me_flags & MDBX_WRITEMAP) { + mdbx_flush_incoherent_cpu_writeback(); + rc = mdbx_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, target_meta), + MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + } else { + const mdbx_filehandle_t fd = (env->me_dsync_fd != INVALID_HANDLE_VALUE) + ? env->me_dsync_fd + : env->me_lazy_fd; + rc = mdbx_pwrite(fd, page, env->me_psize, pgno2bytes(env, target_meta)); + if (rc == MDBX_SUCCESS && fd == env->me_lazy_fd) + rc = mdbx_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + } + + return rc; +} + +__cold int mdbx_env_open_for_recovery(MDBX_env *env, const char *pathname, + unsigned target_meta, bool writeable) { + if (unlikely(target_meta >= NUM_METAS)) + return MDBX_EINVAL; + int rc = check_env(env, false); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + if (unlikely(env->me_map)) + return MDBX_EPERM; + + env->me_stuck_meta = (int8_t)target_meta; + return mdbx_env_open( + env, pathname, writeable ? 
MDBX_EXCLUSIVE : MDBX_EXCLUSIVE | MDBX_RDONLY, + 0); +} + +typedef struct { + void *buffer_for_free; + char *lck, *dxb; + size_t ent_len; +} MDBX_handle_env_pathname; + +__cold static int mdbx_handle_env_pathname(MDBX_handle_env_pathname *ctx, + const char *pathname, + MDBX_env_flags_t *flags, + const mdbx_mode_t mode) { + int rc; + memset(ctx, 0, sizeof(*ctx)); + if (unlikely(!pathname)) + return MDBX_EINVAL; + +#if defined(_WIN32) || defined(_WIN64) + const size_t wlen = mbstowcs(nullptr, pathname, INT_MAX); + if (wlen < 1 || wlen > /* MAX_PATH */ INT16_MAX) + return ERROR_INVALID_NAME; + wchar_t *const pathnameW = _alloca((wlen + 1) * sizeof(wchar_t)); + if (wlen != mbstowcs(pathnameW, pathname, wlen + 1)) + return ERROR_INVALID_NAME; + + const DWORD dwAttrib = GetFileAttributesW(pathnameW); + if (dwAttrib == INVALID_FILE_ATTRIBUTES) { + rc = GetLastError(); + if (rc != MDBX_ENOFILE) + return rc; + if (mode == 0 || (*flags & MDBX_RDONLY) != 0) + /* can't open existing */ + return rc; + + /* auto-create directory if requested */ + if ((*flags & MDBX_NOSUBDIR) == 0 && + !CreateDirectoryW(pathnameW, nullptr)) { + rc = GetLastError(); + if (rc != ERROR_ALREADY_EXISTS) + return rc; + } + } else { + /* ignore passed MDBX_NOSUBDIR flag and set it automatically */ + *flags |= MDBX_NOSUBDIR; + if (dwAttrib & FILE_ATTRIBUTE_DIRECTORY) + *flags -= MDBX_NOSUBDIR; + } +#else + struct stat st; + if (stat(pathname, &st)) { + rc = errno; + if (rc != MDBX_ENOFILE) + return rc; + if (mode == 0 || (*flags & MDBX_RDONLY) != 0) + /* can't open existing */ + return rc; + + /* auto-create directory if requested */ + const mdbx_mode_t dir_mode = + (/* inherit read/write permissions for group and others */ mode & + (S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH)) | + /* always add read/write/search for owner */ S_IRWXU | + ((mode & S_IRGRP) ? /* +search if readable by group */ S_IXGRP : 0) | + ((mode & S_IROTH) ? /* +search if readable by others */ S_IXOTH : 0); + if ((*flags & MDBX_NOSUBDIR) == 0 && mkdir(pathname, dir_mode)) { + rc = errno; + if (rc != EEXIST) + return rc; + } + } else { + /* ignore passed MDBX_NOSUBDIR flag and set it automatically */ + *flags |= MDBX_NOSUBDIR; + if (S_ISDIR(st.st_mode)) + *flags -= MDBX_NOSUBDIR; + } +#endif + + static const char dxb_name[] = MDBX_DATANAME; + static const size_t dxb_name_len = sizeof(dxb_name) - 1; + static const char lck_name[] = MDBX_LOCKNAME; + static const char lock_suffix[] = MDBX_LOCK_SUFFIX; + + ctx->ent_len = strlen(pathname); + if ((*flags & MDBX_NOSUBDIR) && ctx->ent_len >= dxb_name_len && + !memcmp(dxb_name, pathname + ctx->ent_len - dxb_name_len, dxb_name_len)) { + *flags -= MDBX_NOSUBDIR; + ctx->ent_len -= dxb_name_len; + } + + const size_t bytes_needed = + ctx->ent_len * 2 + ((*flags & MDBX_NOSUBDIR) + ? 
sizeof(lock_suffix) + 1 + : sizeof(lck_name) + sizeof(dxb_name)); + ctx->buffer_for_free = mdbx_malloc(bytes_needed); + if (!ctx->buffer_for_free) + return MDBX_ENOMEM; + + ctx->lck = ctx->buffer_for_free; + if (*flags & MDBX_NOSUBDIR) { + ctx->dxb = ctx->lck + ctx->ent_len + sizeof(lock_suffix); + sprintf(ctx->lck, "%s%s", pathname, lock_suffix); + strcpy(ctx->dxb, pathname); + } else { + ctx->dxb = ctx->lck + ctx->ent_len + sizeof(lck_name); + sprintf(ctx->lck, "%.*s%s", (int)ctx->ent_len, pathname, lck_name); + sprintf(ctx->dxb, "%.*s%s", (int)ctx->ent_len, pathname, dxb_name); + } + + return MDBX_SUCCESS; +} + +__cold int mdbx_env_delete(const char *pathname, MDBX_env_delete_mode_t mode) { + switch (mode) { + default: + return MDBX_EINVAL; + case MDBX_ENV_JUST_DELETE: + case MDBX_ENV_ENSURE_UNUSED: + case MDBX_ENV_WAIT_FOR_UNUSED: + break; + } + +#ifdef __e2k__ /* https://bugs.mcst.ru/bugzilla/show_bug.cgi?id=6011 */ + MDBX_env *const dummy_env = alloca(sizeof(MDBX_env)); +#else + MDBX_env dummy_env_silo, *const dummy_env = &dummy_env_silo; +#endif + memset(dummy_env, 0, sizeof(*dummy_env)); + dummy_env->me_flags = + (mode == MDBX_ENV_ENSURE_UNUSED) ? MDBX_EXCLUSIVE : MDBX_ENV_DEFAULTS; + dummy_env->me_os_psize = (unsigned)mdbx_syspagesize(); + dummy_env->me_psize = (unsigned)mdbx_default_pagesize(); + dummy_env->me_pathname = (char *)pathname; + + MDBX_handle_env_pathname env_pathname; + STATIC_ASSERT(sizeof(dummy_env->me_flags) == sizeof(MDBX_env_flags_t)); + int rc = MDBX_RESULT_TRUE, + err = mdbx_handle_env_pathname( + &env_pathname, pathname, (MDBX_env_flags_t *)&dummy_env->me_flags, 0); + if (likely(err == MDBX_SUCCESS)) { + mdbx_filehandle_t clk_handle = INVALID_HANDLE_VALUE, + dxb_handle = INVALID_HANDLE_VALUE; + if (mode > MDBX_ENV_JUST_DELETE) { + err = mdbx_openfile(MDBX_OPEN_DELETE, dummy_env, env_pathname.dxb, + &dxb_handle, 0); + err = (err == MDBX_ENOFILE) ? MDBX_SUCCESS : err; + if (err == MDBX_SUCCESS) { + err = mdbx_openfile(MDBX_OPEN_DELETE, dummy_env, env_pathname.lck, + &clk_handle, 0); + err = (err == MDBX_ENOFILE) ? MDBX_SUCCESS : err; + } + if (err == MDBX_SUCCESS && clk_handle != INVALID_HANDLE_VALUE) + err = mdbx_lockfile(clk_handle, mode == MDBX_ENV_WAIT_FOR_UNUSED); + if (err == MDBX_SUCCESS && dxb_handle != INVALID_HANDLE_VALUE) + err = mdbx_lockfile(dxb_handle, mode == MDBX_ENV_WAIT_FOR_UNUSED); + } + + if (err == MDBX_SUCCESS) { + err = mdbx_removefile(env_pathname.dxb); + if (err == MDBX_SUCCESS) + rc = MDBX_SUCCESS; + else if (err == MDBX_ENOFILE) + err = MDBX_SUCCESS; + } + + if (err == MDBX_SUCCESS) { + err = mdbx_removefile(env_pathname.lck); + if (err == MDBX_SUCCESS) + rc = MDBX_SUCCESS; + else if (err == MDBX_ENOFILE) + err = MDBX_SUCCESS; + } + + if (err == MDBX_SUCCESS && !(dummy_env->me_flags & MDBX_NOSUBDIR)) { + err = mdbx_removedirectory(pathname); + if (err == MDBX_SUCCESS) + rc = MDBX_SUCCESS; + else if (err == MDBX_ENOFILE) + err = MDBX_SUCCESS; + } + + if (dxb_handle != INVALID_HANDLE_VALUE) + mdbx_closefile(dxb_handle); + if (clk_handle != INVALID_HANDLE_VALUE) + mdbx_closefile(clk_handle); + } else if (err == MDBX_ENOFILE) + err = MDBX_SUCCESS; + + mdbx_free(env_pathname.buffer_for_free); + return (err == MDBX_SUCCESS) ? 
rc : err; +} + +__cold int mdbx_env_open(MDBX_env *env, const char *pathname, + MDBX_env_flags_t flags, mdbx_mode_t mode) { + int rc = check_env(env, false); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(flags & ~ENV_USABLE_FLAGS)) + return MDBX_EINVAL; + + if (flags & MDBX_RDONLY) + mode = 0; + + if (env->me_lazy_fd != INVALID_HANDLE_VALUE || + (env->me_flags & MDBX_ENV_ACTIVE) != 0 || env->me_map) + return MDBX_EPERM; + + /* pickup previously mdbx_env_set_flags(), + * but avoid MDBX_UTTERLY_NOSYNC by disjunction */ + const uint32_t saved_me_flags = env->me_flags; + flags = merge_sync_flags(flags, env->me_flags); + + MDBX_handle_env_pathname env_pathname; + rc = mdbx_handle_env_pathname(&env_pathname, pathname, &flags, mode); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + + if (flags & MDBX_RDONLY) { + /* LY: silently ignore irrelevant flags when + * we're only getting read access */ + flags &= ~(MDBX_WRITEMAP | MDBX_DEPRECATED_MAPASYNC | MDBX_SAFE_NOSYNC | + MDBX_NOMETASYNC | MDBX_COALESCE | MDBX_LIFORECLAIM | + MDBX_NOMEMINIT | MDBX_ACCEDE); + } else { +#if MDBX_MMAP_INCOHERENT_FILE_WRITE + /* Temporary `workaround` for OpenBSD kernel's flaw. + * See https://github.com/erthink/libmdbx/issues/67 */ + if ((flags & MDBX_WRITEMAP) == 0) { + if (flags & MDBX_ACCEDE) + flags |= MDBX_WRITEMAP; + else { + mdbx_debug_log(MDBX_LOG_ERROR, __func__, __LINE__, + "System (i.e. OpenBSD) requires MDBX_WRITEMAP because " + "of an internal flaw(s) in a file/buffer/page cache.\n"); + rc = 42 /* ENOPROTOOPT */; + goto bailout; + } + } +#endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */ + } + + env->me_flags = (flags & ~MDBX_FATAL_ERROR) | MDBX_ENV_ACTIVE; + env->me_pathname = mdbx_calloc(env_pathname.ent_len + 1, 1); + env->me_dbxs = mdbx_calloc(env->me_maxdbs, sizeof(MDBX_dbx)); + env->me_dbflags = mdbx_calloc(env->me_maxdbs, sizeof(env->me_dbflags[0])); + env->me_dbiseqs = mdbx_calloc(env->me_maxdbs, sizeof(env->me_dbiseqs[0])); + if (!(env->me_dbxs && env->me_pathname && env->me_dbflags && + env->me_dbiseqs)) { + rc = MDBX_ENOMEM; + goto bailout; + } + memcpy(env->me_pathname, env_pathname.dxb, env_pathname.ent_len); + env->me_dbxs[FREE_DBI].md_cmp = cmp_int_align4; /* aligned MDBX_INTEGERKEY */ + env->me_dbxs[FREE_DBI].md_dcmp = cmp_lenfast; + + rc = mdbx_openfile(F_ISSET(flags, MDBX_RDONLY) ? MDBX_OPEN_DXB_READ + : MDBX_OPEN_DXB_LAZY, + env, env_pathname.dxb, &env->me_lazy_fd, mode); + if (rc != MDBX_SUCCESS) + goto bailout; + + mdbx_assert(env, env->me_dsync_fd == INVALID_HANDLE_VALUE); + if ((flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC)) == 0) { + rc = mdbx_openfile(MDBX_OPEN_DXB_DSYNC, env, env_pathname.dxb, + &env->me_dsync_fd, 0); + mdbx_ensure(env, (rc != MDBX_SUCCESS) == + (env->me_dsync_fd == INVALID_HANDLE_VALUE)); + } + +#if MDBX_LOCKING == MDBX_LOCKING_SYSV + env->me_sysv_ipc.key = ftok(env_pathname.dxb, 42); + if (env->me_sysv_ipc.key == -1) { + rc = errno; + goto bailout; + } +#endif /* MDBX_LOCKING */ + +#if !(defined(_WIN32) || defined(_WIN64)) + if (mode == 0) { + /* pickup mode for lck-file */ + struct stat st; + if (fstat(env->me_lazy_fd, &st)) { + rc = errno; + goto bailout; + } + mode = st.st_mode; + } + mode = (/* inherit read permissions for group and others */ mode & + (S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH)) | + /* always add read/write/search for owner */ S_IRUSR | S_IWUSR | + ((mode & S_IRGRP) ? /* +write if readable by group */ S_IWGRP : 0) | + ((mode & S_IROTH) ? 
/* +write if readable by others */ S_IWOTH : 0); +#endif /* !Windows */ + const int lck_rc = mdbx_setup_lck(env, env_pathname.lck, mode); + if (MDBX_IS_ERROR(lck_rc)) { + rc = lck_rc; + goto bailout; + } + + /* Set the position in files outside of the data to avoid corruption + * due to erroneous use of file descriptors in the application code. */ + mdbx_fseek(env->me_lfd, UINT64_C(1) << 63); + mdbx_fseek(env->me_lazy_fd, UINT64_C(1) << 63); + if (env->me_dsync_fd != INVALID_HANDLE_VALUE) + mdbx_fseek(env->me_dsync_fd, UINT64_C(1) << 63); + + const MDBX_env_flags_t rigorous_flags = + MDBX_SAFE_NOSYNC | MDBX_DEPRECATED_MAPASYNC; + const MDBX_env_flags_t mode_flags = rigorous_flags | MDBX_NOMETASYNC + + MDBX_LIFORECLAIM | MDBX_COALESCE + + MDBX_NORDAHEAD; + + MDBX_lockinfo *const lck = env->me_lck_mmap.lck; + if (lck && lck_rc != MDBX_RESULT_TRUE && (env->me_flags & MDBX_RDONLY) == 0) { + while (atomic_load32(&lck->mti_envmode, mo_AcquireRelease) == MDBX_RDONLY) { + if (atomic_cas32(&lck->mti_envmode, MDBX_RDONLY, + env->me_flags & mode_flags)) { + /* The case: + * - let's assume that for some reason the DB file is smaller + * than it should be according to the geometry, + * but not smaller than the last page used; + * - the first process that opens the database (lck_rc = true) + * does this in readonly mode and therefore cannot bring + * the file size back to normal; + * - a subsequent process (lck_rc = false) opens the DB in read-write + * mode and now ends up here. + * + * FIXME: Should we re-check and set the size of DB-file right here? */ + break; + } + atomic_yield(); + } + + if (env->me_flags & MDBX_ACCEDE) { + /* pick up the current mode-flags, including MDBX_LIFORECLAIM | + * MDBX_COALESCE | MDBX_NORDAHEAD */ + const unsigned diff = + (lck->mti_envmode.weak ^ env->me_flags) & mode_flags; + mdbx_notice("accede mode-flags: 0x%X, 0x%X -> 0x%X", diff, env->me_flags, + env->me_flags ^ diff); + env->me_flags ^= diff; + } + + if ((lck->mti_envmode.weak ^ env->me_flags) & rigorous_flags) { + mdbx_error("%s", "current mode/flags incompatible with requested"); + rc = MDBX_INCOMPATIBLE; + goto bailout; + } + } + + const int dxb_rc = mdbx_setup_dxb(env, lck_rc, mode); + if (MDBX_IS_ERROR(dxb_rc)) { + rc = dxb_rc; + goto bailout; + } + + if (unlikely(/* recovery mode */ env->me_stuck_meta >= 0) && + (lck_rc != /* exclusive */ MDBX_RESULT_TRUE || + (flags & MDBX_EXCLUSIVE) == 0)) { + mdbx_error("%s", "recovery requires exclusive mode"); + rc = MDBX_BUSY; + goto bailout; + } + + mdbx_debug("opened dbenv %p", (void *)env); + if (lck) { + if (lck_rc == MDBX_RESULT_TRUE) { + lck->mti_envmode.weak = env->me_flags & (mode_flags | MDBX_RDONLY); + rc = mdbx_lck_downgrade(env); + mdbx_debug("lck-downgrade-%s: rc %i", + (env->me_flags & MDBX_EXCLUSIVE) ? 
"partial" : "full", rc); + if (rc != MDBX_SUCCESS) + goto bailout; + } else { + rc = mdbx_cleanup_dead_readers(env, false, NULL); + if (MDBX_IS_ERROR(rc)) + goto bailout; + } + + if ((env->me_flags & MDBX_NOTLS) == 0) { + rc = mdbx_rthc_alloc(&env->me_txkey, &lck->mti_readers[0], + &lck->mti_readers[env->me_maxreaders]); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + env->me_flags |= MDBX_ENV_TXKEY; + } + } + + if ((flags & MDBX_RDONLY) == 0) { + const size_t tsize = sizeof(MDBX_txn), + size = tsize + env->me_maxdbs * + (sizeof(MDBX_db) + sizeof(MDBX_cursor *) + + sizeof(unsigned) + 1); + rc = mdbx_memalign_alloc( + env->me_os_psize, + env->me_psize * (1 /* page buffer */ + 1 /* page killer buffer */), + &env->me_pbuf); + if (rc == MDBX_SUCCESS) { + memset(env->me_pbuf, -1, env->me_psize * 2); + MDBX_txn *txn = mdbx_calloc(1, size); + if (txn) { + txn->mt_dbs = (MDBX_db *)((char *)txn + tsize); + txn->tw.cursors = (MDBX_cursor **)(txn->mt_dbs + env->me_maxdbs); + txn->mt_dbiseqs = (unsigned *)(txn->tw.cursors + env->me_maxdbs); + txn->mt_dbistate = (uint8_t *)(txn->mt_dbiseqs + env->me_maxdbs); + txn->mt_env = env; + txn->mt_dbxs = env->me_dbxs; + txn->mt_flags = MDBX_TXN_FINISHED; + env->me_txn0 = txn; + txn->tw.retired_pages = mdbx_pnl_alloc(MDBX_PNL_INITIAL); + txn->tw.reclaimed_pglist = mdbx_pnl_alloc(MDBX_PNL_INITIAL); + if (unlikely(!txn->tw.retired_pages || !txn->tw.reclaimed_pglist)) + rc = MDBX_ENOMEM; + } else + rc = MDBX_ENOMEM; + } + } + +#if MDBX_DEBUG + if (rc == MDBX_SUCCESS) { + MDBX_meta *meta = mdbx_meta_head(env); + MDBX_db *db = &meta->mm_dbs[MAIN_DBI]; + + mdbx_debug("opened database version %u, pagesize %u", + (uint8_t)unaligned_peek_u64(4, meta->mm_magic_and_version), + env->me_psize); + mdbx_debug("using meta page %" PRIaPGNO ", txn %" PRIaTXN, + data_page(meta)->mp_pgno, mdbx_meta_txnid_fluid(env, meta)); + mdbx_debug("depth: %u", db->md_depth); + mdbx_debug("entries: %" PRIu64, db->md_entries); + mdbx_debug("branch pages: %" PRIaPGNO, db->md_branch_pages); + mdbx_debug("leaf pages: %" PRIaPGNO, db->md_leaf_pages); + mdbx_debug("overflow pages: %" PRIaPGNO, db->md_overflow_pages); + mdbx_debug("root: %" PRIaPGNO, db->md_root); + mdbx_debug("schema_altered: %" PRIaTXN, db->md_mod_txnid); + } +#endif + +bailout: + if (rc != MDBX_SUCCESS) { + rc = mdbx_env_close0(env) ? MDBX_PANIC : rc; + env->me_flags = + saved_me_flags | ((rc != MDBX_PANIC) ? 
0 : MDBX_FATAL_ERROR); + } else { +#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) + mdbx_txn_valgrind(env, nullptr); +#endif + } + mdbx_free(env_pathname.buffer_for_free); + return rc; +} + +/* Destroy resources from mdbx_env_open(), clear our readers & DBIs */ +static __cold int mdbx_env_close0(MDBX_env *env) { + env->me_stuck_meta = -1; + if (!(env->me_flags & MDBX_ENV_ACTIVE)) { + mdbx_ensure(env, env->me_lcklist_next == nullptr); + return MDBX_SUCCESS; + } + + env->me_flags &= ~MDBX_ENV_ACTIVE; + env->me_lck = nullptr; + if (env->me_flags & MDBX_ENV_TXKEY) + mdbx_rthc_remove(env->me_txkey); + + lcklist_lock(); + const int rc = lcklist_detach_locked(env); + lcklist_unlock(); + + if (env->me_map) { + mdbx_munmap(&env->me_dxb_mmap); +#ifdef MDBX_USE_VALGRIND + VALGRIND_DISCARD(env->me_valgrind_handle); + env->me_valgrind_handle = -1; +#endif + } + + if (env->me_dsync_fd != INVALID_HANDLE_VALUE) { + (void)mdbx_closefile(env->me_dsync_fd); + env->me_dsync_fd = INVALID_HANDLE_VALUE; + } + + if (env->me_lazy_fd != INVALID_HANDLE_VALUE) { + (void)mdbx_closefile(env->me_lazy_fd); + env->me_lazy_fd = INVALID_HANDLE_VALUE; + } + + if (env->me_lck_mmap.lck) + mdbx_munmap(&env->me_lck_mmap); + + if (env->me_lfd != INVALID_HANDLE_VALUE) { + (void)mdbx_closefile(env->me_lfd); + env->me_lfd = INVALID_HANDLE_VALUE; + } + + if (env->me_dbxs) { + for (unsigned i = env->me_numdbs; --i >= CORE_DBS;) + mdbx_free(env->me_dbxs[i].md_name.iov_base); + mdbx_free(env->me_dbxs); + } + mdbx_memalign_free(env->me_pbuf); + mdbx_free(env->me_dbiseqs); + mdbx_free(env->me_dbflags); + mdbx_free(env->me_pathname); + if (env->me_txn0) { + mdbx_dpl_free(env->me_txn0); + mdbx_txl_free(env->me_txn0->tw.lifo_reclaimed); + mdbx_pnl_free(env->me_txn0->tw.retired_pages); + mdbx_pnl_free(env->me_txn0->tw.spill_pages); + mdbx_pnl_free(env->me_txn0->tw.reclaimed_pglist); + mdbx_free(env->me_txn0); + } + env->me_flags = 0; + return rc; +} + +__cold int mdbx_env_close_ex(MDBX_env *env, bool dont_sync) { + MDBX_page *dp; + int rc = MDBX_SUCCESS; + + if (unlikely(!env)) + return MDBX_EINVAL; + + if (unlikely(env->me_signature.weak != MDBX_ME_SIGNATURE)) + return MDBX_EBADSIGN; + +#if MDBX_ENV_CHECKPID || !(defined(_WIN32) || defined(_WIN64)) + /* Check the PID even if MDBX_ENV_CHECKPID=0 on non-Windows + * platforms (i.e. where fork() is available). + * This is required to legitimize a call after fork() + * from a child process, which should be allowed to free resources. */ + if (unlikely(env->me_pid != mdbx_getpid())) + env->me_flags |= MDBX_FATAL_ERROR; +#endif /* MDBX_ENV_CHECKPID */ + + if (env->me_map && (env->me_flags & (MDBX_RDONLY | MDBX_FATAL_ERROR)) == 0 && + env->me_txn0) { + if (env->me_txn0->mt_owner && env->me_txn0->mt_owner != mdbx_thread_self()) + return MDBX_BUSY; + } else + dont_sync = true; + + if (!atomic_cas32(&env->me_signature, MDBX_ME_SIGNATURE, 0)) + return MDBX_EBADSIGN; + + if (!dont_sync) { +#if defined(_WIN32) || defined(_WIN64) + /* On Windows it is impossible, without blocking, to determine whether + * another process is running a write transaction, because in the + * "owner died" condition the kernel doesn't release the file lock + * immediately. */ + rc = mdbx_env_sync_internal(env, true, false); + rc = (rc == MDBX_RESULT_TRUE) ? 
MDBX_SUCCESS : rc; +#else + struct stat st; + if (unlikely(fstat(env->me_lazy_fd, &st))) + rc = errno; + else if (st.st_nlink > 0 /* don't sync deleted files */) { + rc = mdbx_env_sync_internal(env, true, true); + rc = (rc == MDBX_BUSY || rc == EAGAIN || rc == EACCES || rc == EBUSY || + rc == EWOULDBLOCK || rc == MDBX_RESULT_TRUE) + ? MDBX_SUCCESS + : rc; + } +#endif + } + + mdbx_assert(env, env->me_signature.weak == 0); + rc = mdbx_env_close0(env) ? MDBX_PANIC : rc; + mdbx_ensure(env, mdbx_fastmutex_destroy(&env->me_dbi_lock) == MDBX_SUCCESS); +#if defined(_WIN32) || defined(_WIN64) + /* me_remap_guard has no destructor (Slim Reader/Writer Lock) */ + DeleteCriticalSection(&env->me_windowsbug_lock); +#else + mdbx_ensure(env, + mdbx_fastmutex_destroy(&env->me_remap_guard) == MDBX_SUCCESS); +#endif /* Windows */ + +#if MDBX_LOCKING > MDBX_LOCKING_SYSV + MDBX_lockinfo *const stub = lckless_stub(env); + mdbx_ensure(env, mdbx_ipclock_destroy(&stub->mti_wlock) == 0); +#endif /* MDBX_LOCKING */ + + while ((dp = env->me_dp_reserve) != NULL) { + ASAN_UNPOISON_MEMORY_REGION(dp, env->me_psize); + VALGRIND_MAKE_MEM_DEFINED(&dp->mp_next, sizeof(dp->mp_next)); + env->me_dp_reserve = dp->mp_next; + mdbx_free(dp); + } + VALGRIND_DESTROY_MEMPOOL(env); + mdbx_ensure(env, env->me_lcklist_next == nullptr); + env->me_pid = 0; + mdbx_free(env); + + return rc; +} + +#ifndef LIBMDBX_NO_EXPORTS_LEGACY_API +__cold int mdbx_env_close(MDBX_env *env) { + return __inline_mdbx_env_close(env); +} +#endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */ + +/* Compare two items pointing at aligned unsigned ints. */ +static int __hot cmp_int_align4(const MDBX_val *a, const MDBX_val *b) { + mdbx_assert(NULL, a->iov_len == b->iov_len); + switch (a->iov_len) { + case 4: + return CMP2INT(unaligned_peek_u32(4, a->iov_base), + unaligned_peek_u32(4, b->iov_base)); + case 8: + return CMP2INT(unaligned_peek_u64(4, a->iov_base), + unaligned_peek_u64(4, b->iov_base)); + default: + mdbx_assert_fail(NULL, "invalid size for INTEGERKEY/INTEGERDUP", __func__, + __LINE__); + return 0; + } +} + +/* Compare two items pointing at 2-byte aligned unsigned ints. */ +static int __hot cmp_int_align2(const MDBX_val *a, const MDBX_val *b) { + mdbx_assert(NULL, a->iov_len == b->iov_len); + switch (a->iov_len) { + case 4: + return CMP2INT(unaligned_peek_u32(2, a->iov_base), + unaligned_peek_u32(2, b->iov_base)); + case 8: + return CMP2INT(unaligned_peek_u64(2, a->iov_base), + unaligned_peek_u64(2, b->iov_base)); + default: + mdbx_assert_fail(NULL, "invalid size for INTEGERKEY/INTEGERDUP", __func__, + __LINE__); + return 0; + } +} + +/* Compare two items pointing at unsigned values with unknown alignment. + * + * This is also set as MDBX_INTEGERDUP|MDBX_DUPFIXED's MDBX_dbx.md_dcmp. */ +static int __hot cmp_int_unaligned(const MDBX_val *a, const MDBX_val *b) { + mdbx_assert(NULL, a->iov_len == b->iov_len); + switch (a->iov_len) { + case 4: + return CMP2INT(unaligned_peek_u32(1, a->iov_base), + unaligned_peek_u32(1, b->iov_base)); + case 8: + return CMP2INT(unaligned_peek_u64(1, a->iov_base), + unaligned_peek_u64(1, b->iov_base)); + default: + mdbx_assert_fail(NULL, "invalid size for INTEGERKEY/INTEGERDUP", __func__, + __LINE__); + return 0; + } +} + +/* Compare two items lexically */ +static int __hot cmp_lexical(const MDBX_val *a, const MDBX_val *b) { + if (a->iov_len == b->iov_len) + return memcmp(a->iov_base, b->iov_base, a->iov_len); + + const int diff_len = (a->iov_len < b->iov_len) ? -1 : 1; + const size_t shortest = (a->iov_len < b->iov_len) ? 
a->iov_len : b->iov_len; + int diff_data = memcmp(a->iov_base, b->iov_base, shortest); + return likely(diff_data) ? diff_data : diff_len; +} + +/* Compare two items in reverse byte order */ +static int __hot cmp_reverse(const MDBX_val *a, const MDBX_val *b) { + const uint8_t *pa = (const uint8_t *)a->iov_base + a->iov_len; + const uint8_t *pb = (const uint8_t *)b->iov_base + b->iov_len; + const size_t shortest = (a->iov_len < b->iov_len) ? a->iov_len : b->iov_len; + + const uint8_t *const end = pa - shortest; + while (pa != end) { + int diff = *--pa - *--pb; + if (likely(diff)) + return diff; + } + return CMP2INT(a->iov_len, b->iov_len); +} + +/* Fast non-lexical comparator */ +static int __hot cmp_lenfast(const MDBX_val *a, const MDBX_val *b) { + int diff = CMP2INT(a->iov_len, b->iov_len); + return likely(diff) ? diff : memcmp(a->iov_base, b->iov_base, a->iov_len); +} + +static bool unsure_equal(MDBX_cmp_func cmp, const MDBX_val *a, + const MDBX_val *b) { + /* check for the use of a known-good comparator, + * or otherwise for a full byte-to-byte match */ + return cmp == cmp_lenfast || cmp == cmp_lexical || cmp == cmp_reverse || + cmp == cmp_int_unaligned || cmp_lenfast(a, b) == 0; +} + +/* Search for key within a page, using binary search. + * Returns the smallest entry larger than or equal to the key. + * Updates the cursor index with the index of the found entry. + * If no entry larger than or equal to the key is found, returns NULL. */ +static struct node_result __hot mdbx_node_search(MDBX_cursor *mc, + const MDBX_val *key) { + MDBX_page *mp = mc->mc_pg[mc->mc_top]; + const int nkeys = page_numkeys(mp); + DKBUF_DEBUG; + + mdbx_debug("searching %u keys in %s %spage %" PRIaPGNO, nkeys, + IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "", + mp->mp_pgno); + + struct node_result ret; + ret.exact = false; + STATIC_ASSERT(P_BRANCH == 1); + int low = mp->mp_flags & P_BRANCH; + int high = nkeys - 1; + if (unlikely(high < low)) { + mc->mc_ki[mc->mc_top] = 0; + ret.node = NULL; + return ret; + } + + int cr = 0, i = 0; + MDBX_cmp_func *cmp = mc->mc_dbx->md_cmp; + MDBX_val nodekey; + if (unlikely(IS_LEAF2(mp))) { + mdbx_cassert(mc, mp->mp_leaf2_ksize == mc->mc_db->md_xsize); + nodekey.iov_len = mp->mp_leaf2_ksize; + do { + i = (low + high) >> 1; + nodekey.iov_base = page_leaf2key(mp, i, nodekey.iov_len); + mdbx_cassert(mc, (char *)mp + mc->mc_txn->mt_env->me_psize >= + (char *)nodekey.iov_base + nodekey.iov_len); + cr = cmp(key, &nodekey); + mdbx_debug("found leaf index %u [%s], rc = %i", i, DKEY_DEBUG(&nodekey), + cr); + if (unlikely(cr == 0)) { + ret.exact = true; + break; + } + low = (cr < 0) ? low : i + 1; + high = (cr < 0) ? i - 1 : high; + } while (likely(low <= high)); + + /* Found entry is less than the key. */ + /* Skip to get the smallest entry larger than the key. */ + i += cr > 0; + + /* store the key index */ + mc->mc_ki[mc->mc_top] = (indx_t)i; + ret.node = (i < nkeys) + ? /* fake for LEAF2 */ (MDBX_node *)(intptr_t)-1 + : /* There is no entry larger than or equal to the key. */ NULL; + return ret; + } + + if (IS_BRANCH(mp) && cmp == cmp_int_align2) + /* Branch pages have no data, so if using integer keys, + * alignment is guaranteed. Use faster cmp_int_align4(). 
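+ * (The stricter alignment hint permits cheaper word-wide reads, which
+ * is why cmp_int_align4() is preferred whenever it is known to be safe.)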
*/ + cmp = cmp_int_align4; + + MDBX_node *node; + do { + i = (low + high) >> 1; + + node = page_node(mp, i); + nodekey.iov_len = node_ks(node); + nodekey.iov_base = node_key(node); + mdbx_cassert(mc, (char *)mp + mc->mc_txn->mt_env->me_psize >= + (char *)nodekey.iov_base + nodekey.iov_len); + + cr = cmp(key, &nodekey); + if (IS_LEAF(mp)) + mdbx_debug("found leaf index %u [%s], rc = %i", i, DKEY_DEBUG(&nodekey), + cr); + else + mdbx_debug("found branch index %u [%s -> %" PRIaPGNO "], rc = %i", i, + DKEY_DEBUG(&nodekey), node_pgno(node), cr); + if (unlikely(cr == 0)) { + ret.exact = true; + break; + } + low = (cr < 0) ? low : i + 1; + high = (cr < 0) ? i - 1 : high; + } while (likely(low <= high)); + + /* Found entry is less than the key. */ + /* Skip to get the smallest entry larger than key. */ + i += cr > 0; + + /* store the key index */ + mc->mc_ki[mc->mc_top] = (indx_t)i; + ret.node = (i < nkeys) + ? page_node(mp, i) + : /* There is no entry larger or equal to the key. */ NULL; + return ret; +} + +/* Pop a page off the top of the cursor's stack. */ +static __inline void mdbx_cursor_pop(MDBX_cursor *mc) { + if (mc->mc_snum) { + mdbx_debug("popped page %" PRIaPGNO " off db %d cursor %p", + mc->mc_pg[mc->mc_top]->mp_pgno, DDBI(mc), (void *)mc); + if (--mc->mc_snum) { + mc->mc_top--; + } else { + mc->mc_flags &= ~C_INITIALIZED; + } + } +} + +/* Push a page onto the top of the cursor's stack. + * Set MDBX_TXN_ERROR on failure. */ +static __inline int mdbx_cursor_push(MDBX_cursor *mc, MDBX_page *mp) { + mdbx_debug("pushing page %" PRIaPGNO " on db %d cursor %p", mp->mp_pgno, + DDBI(mc), (void *)mc); + + if (unlikely(mc->mc_snum >= CURSOR_STACK)) { + mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; + return MDBX_CURSOR_FULL; + } + + mdbx_cassert(mc, mc->mc_snum < UINT16_MAX); + mc->mc_top = mc->mc_snum++; + mc->mc_pg[mc->mc_top] = mp; + mc->mc_ki[mc->mc_top] = 0; + + return MDBX_SUCCESS; +} + +__hot static struct page_result +mdbx_page_get_ex(MDBX_cursor *const mc, const pgno_t pgno, + /* TODO: use parent-page ptr */ txnid_t front) { + struct page_result ret; + MDBX_txn *const txn = mc->mc_txn; + mdbx_tassert(txn, front <= txn->mt_front); + if (unlikely(pgno >= txn->mt_next_pgno)) { + mdbx_error("page #%" PRIaPGNO " beyond next-pgno", pgno); + ret.page = nullptr; + corrupted: + mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; + ret.err = MDBX_PAGE_NOTFOUND; + return ret; + } + + MDBX_env *const env = txn->mt_env; + mdbx_assert(env, ((txn->mt_flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); + if (unlikely((txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0)) { + const MDBX_txn *spiller = txn; + do { + /* Spilled pages were dirtied in this txn and flushed + * because the dirty list got full. Bring this page + * back in from the map (but don't unspill it here, + * leave that unless page_touch happens again). 
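+ * (The spill already flushed the dirty content to the file, so the
+ * mapped copy fetched via pgno2page() below is current for reading.)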
*/ + if (unlikely(spiller->mt_flags & MDBX_TXN_SPILLS) && + spiller->tw.spill_pages && + mdbx_pnl_exist(spiller->tw.spill_pages, pgno << 1)) { + goto spilled; + } + + const unsigned i = mdbx_dpl_search(spiller, pgno); + assert((int)i > 0); + if (spiller->tw.dirtylist->items[i].pgno == pgno) { + ret.page = spiller->tw.dirtylist->items[i].ptr; + spiller->tw.dirtylist->items[i].lru = txn->tw.dirtylru++; + goto dirty; + } + + spiller = spiller->mt_parent; + } while (spiller != NULL); + } + +spilled: + ret.page = pgno2page(env, pgno); + +dirty: + if (unlikely(ret.page->mp_pgno != pgno)) { + bad_page(ret.page, + "mismatch actual pgno (%" PRIaPGNO ") != expected (%" PRIaPGNO + ")\n", + ret.page->mp_pgno, pgno); + goto corrupted; + } + +#if !MDBX_DISABLE_PAGECHECKS + if (unlikely(ret.page->mp_flags & P_ILL_BITS)) { + bad_page(ret.page, "invalid page's flags (%u)\n", ret.page->mp_flags); + goto corrupted; + } + + if (unlikely(ret.page->mp_txnid > front) && + (ret.page->mp_txnid > txn->mt_front || front < txn->mt_txnid)) { + bad_page(ret.page, + "invalid page txnid (%" PRIaTXN ") for %s' txnid (%" PRIaTXN ")\n", + ret.page->mp_txnid, + (front == txn->mt_front && front != txn->mt_txnid) ? "front-txn" + : "parent-page", + front); + goto corrupted; + } + + if (unlikely((ret.page->mp_upper < ret.page->mp_lower || + ((ret.page->mp_lower | ret.page->mp_upper) & 1) || + PAGEHDRSZ + ret.page->mp_upper > env->me_psize) && + !IS_OVERFLOW(ret.page))) { + bad_page(ret.page, "invalid page lower(%u)/upper(%u) with limit (%u)\n", + ret.page->mp_lower, ret.page->mp_upper, page_space(env)); + goto corrupted; + } +#endif /* !MDBX_DISABLE_PAGECHECKS */ + + ret.err = MDBX_SUCCESS; + if (mdbx_audit_enabled()) + ret.err = mdbx_page_check(mc, ret.page, C_UPDATING); + return ret; +} + +/* Finish mdbx_page_search() / mdbx_page_search_lowest(). + * The cursor is at the root page, set up the rest of it. */ +__hot static int mdbx_page_search_root(MDBX_cursor *mc, const MDBX_val *key, + int flags) { + MDBX_page *mp = mc->mc_pg[mc->mc_top]; + int rc; + DKBUF_DEBUG; + + while (IS_BRANCH(mp)) { + MDBX_node *node; + int i; + + mdbx_debug("branch page %" PRIaPGNO " has %u keys", mp->mp_pgno, + page_numkeys(mp)); + /* Don't assert on branch pages in the GC. We can get here + * while in the process of rebalancing a GC branch page; we must + * let that proceed. 
ITS#8336 */ + mdbx_cassert(mc, !mc->mc_dbi || page_numkeys(mp) > 1); + mdbx_debug("found index 0 to page %" PRIaPGNO, node_pgno(page_node(mp, 0))); + + if (flags & (MDBX_PS_FIRST | MDBX_PS_LAST)) { + i = 0; + if (flags & MDBX_PS_LAST) { + i = page_numkeys(mp) - 1; + /* if already init'd, see if we're already in right place */ + if (mc->mc_flags & C_INITIALIZED) { + if (mc->mc_ki[mc->mc_top] == i) { + mc->mc_top = mc->mc_snum++; + mp = mc->mc_pg[mc->mc_top]; + goto ready; + } + } + } + } else { + const struct node_result nsr = mdbx_node_search(mc, key); + if (nsr.node) + i = mc->mc_ki[mc->mc_top] + nsr.exact - 1; + else + i = page_numkeys(mp) - 1; + mdbx_debug("following index %u for key [%s]", i, DKEY_DEBUG(key)); + } + + mdbx_cassert(mc, i >= 0 && i < (int)page_numkeys(mp)); + node = page_node(mp, i); + + if (unlikely((rc = mdbx_page_get(mc, node_pgno(node), &mp, + pp_txnid4chk(mp, mc->mc_txn))) != 0)) + return rc; + + mc->mc_ki[mc->mc_top] = (indx_t)i; + if (unlikely(rc = mdbx_cursor_push(mc, mp))) + return rc; + + ready: + if (flags & MDBX_PS_MODIFY) { + if (unlikely((rc = mdbx_page_touch(mc)) != 0)) + return rc; + mp = mc->mc_pg[mc->mc_top]; + } + } + +#if !MDBX_DISABLE_PAGECHECKS + if (unlikely(!IS_LEAF(mp))) { + mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; + return bad_page(mp, "index points to a page with 0x%02x flags\n", + mp->mp_flags); + } +#endif /* !MDBX_DISABLE_PAGECHECKS */ + + mdbx_debug("found leaf page %" PRIaPGNO " for key [%s]", mp->mp_pgno, + DKEY_DEBUG(key)); + mc->mc_flags |= C_INITIALIZED; + mc->mc_flags &= ~C_EOF; + + return MDBX_SUCCESS; +} + +static int mdbx_setup_dbx(MDBX_dbx *const dbx, const MDBX_db *const db, + const unsigned pagesize) { + if (unlikely(!dbx->md_cmp)) { + dbx->md_cmp = get_default_keycmp(db->md_flags); + dbx->md_dcmp = get_default_datacmp(db->md_flags); + } + + dbx->md_klen_min = + (db->md_flags & MDBX_INTEGERKEY) ? 4 /* sizeof(uint32_t) */ : 0; + dbx->md_klen_max = keysize_max(pagesize, db->md_flags); + assert(dbx->md_klen_max != (unsigned)-1); + + dbx->md_vlen_min = (db->md_flags & MDBX_INTEGERDUP) + ? 4 /* sizeof(uint32_t) */ + : ((db->md_flags & MDBX_DUPFIXED) ? 1 : 0); + dbx->md_vlen_max = valsize_max(pagesize, db->md_flags); + assert(dbx->md_vlen_max != (unsigned)-1); + + if ((db->md_flags & (MDBX_DUPFIXED | MDBX_INTEGERDUP)) != 0 && db->md_xsize) { + if (!MDBX_DISABLE_PAGECHECKS && unlikely(db->md_xsize < dbx->md_vlen_min || + db->md_xsize > dbx->md_vlen_max)) { + mdbx_error("db.md_xsize (%u) <> min/max value-length (%zu/%zu)", + db->md_xsize, dbx->md_vlen_min, dbx->md_vlen_max); + return MDBX_CORRUPTED; + } + dbx->md_vlen_min = dbx->md_vlen_max = db->md_xsize; + } + return MDBX_SUCCESS; +} + +static int mdbx_fetch_sdb(MDBX_txn *txn, MDBX_dbi dbi) { + MDBX_cursor_couple couple; + if (unlikely(TXN_DBI_CHANGED(txn, dbi))) + return MDBX_BAD_DBI; + int rc = mdbx_cursor_init(&couple.outer, txn, MAIN_DBI); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + MDBX_dbx *const dbx = &txn->mt_dbxs[dbi]; + rc = mdbx_page_search(&couple.outer, &dbx->md_name, 0); + if (unlikely(rc != MDBX_SUCCESS)) + return (rc == MDBX_NOTFOUND) ? 
MDBX_BAD_DBI : rc; + + MDBX_val data; + struct node_result nsr = mdbx_node_search(&couple.outer, &dbx->md_name); + if (unlikely(!nsr.exact)) + return MDBX_BAD_DBI; + if (unlikely((node_flags(nsr.node) & (F_DUPDATA | F_SUBDATA)) != F_SUBDATA)) + return MDBX_INCOMPATIBLE; /* not a named DB */ + + const txnid_t pp_txnid = + pp_txnid4chk(couple.outer.mc_pg[couple.outer.mc_top], txn); + rc = mdbx_node_read(&couple.outer, nsr.node, &data, pp_txnid); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(data.iov_len != sizeof(MDBX_db))) + return MDBX_INCOMPATIBLE; /* not a named DB */ + + uint16_t md_flags = UNALIGNED_PEEK_16(data.iov_base, MDBX_db, md_flags); + /* The txn may not know this DBI, or another process may + * have dropped and recreated the DB with other flags. */ + MDBX_db *const db = &txn->mt_dbs[dbi]; + if (unlikely((db->md_flags & DB_PERSISTENT_FLAGS) != md_flags)) + return MDBX_INCOMPATIBLE; + + memcpy(db, data.iov_base, sizeof(MDBX_db)); +#if !MDBX_DISABLE_PAGECHECKS + mdbx_tassert(txn, txn->mt_front >= pp_txnid); + if (unlikely(db->md_mod_txnid > pp_txnid)) { + mdbx_error("db.md_mod_txnid (%" PRIaTXN ") > page-txnid (%" PRIaTXN ")", + db->md_mod_txnid, pp_txnid); + return MDBX_CORRUPTED; + } +#endif /* !MDBX_DISABLE_PAGECHECKS */ + rc = mdbx_setup_dbx(dbx, db, txn->mt_env->me_psize); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + txn->mt_dbistate[dbi] &= ~DBI_STALE; + return MDBX_SUCCESS; +} + +/* Search for the lowest key under the current branch page. + * This just bypasses a numkeys check in the current page + * before calling mdbx_page_search_root(), because the callers + * are all in situations where the current page is known to + * be underfilled. */ +__hot static int mdbx_page_search_lowest(MDBX_cursor *mc) { + MDBX_page *mp = mc->mc_pg[mc->mc_top]; + mdbx_cassert(mc, IS_BRANCH(mp)); + MDBX_node *node = page_node(mp, 0); + int rc; + + if (unlikely((rc = mdbx_page_get(mc, node_pgno(node), &mp, + pp_txnid4chk(mp, mc->mc_txn))) != 0)) + return rc; + + mc->mc_ki[mc->mc_top] = 0; + if (unlikely(rc = mdbx_cursor_push(mc, mp))) + return rc; + return mdbx_page_search_root(mc, NULL, MDBX_PS_FIRST); +} + +/* Search for the page a given key should be in. + * Push it and its parent pages on the cursor stack. + * + * [in,out] mc the cursor for this operation. + * [in] key the key to search for, or NULL for first/last page. + * [in] flags If MDBX_PS_MODIFY is set, visited pages in the DB + * are touched (updated with new page numbers). + * If MDBX_PS_FIRST or MDBX_PS_LAST is set, find first or last + * leaf. + * This is used by mdbx_cursor_first() and mdbx_cursor_last(). + * If MDBX_PS_ROOTONLY set, just fetch root node, no further + * lookups. + * + * Returns 0 on success, non-zero on failure. */ +__hot static int mdbx_page_search(MDBX_cursor *mc, const MDBX_val *key, + int flags) { + int rc; + pgno_t root; + + /* Make sure the txn is still viable, then find the root from + * the txn's db table and set it as the root of the cursor's stack. */ + if (unlikely(mc->mc_txn->mt_flags & MDBX_TXN_BLOCKED)) { + mdbx_debug("%s", "transaction has failed, must abort"); + return MDBX_BAD_TXN; + } + + /* Make sure we're using an up-to-date root */ + if (unlikely(*mc->mc_dbistate & DBI_STALE)) { + rc = mdbx_fetch_sdb(mc->mc_txn, mc->mc_dbi); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + } + root = mc->mc_db->md_root; + + if (unlikely(root == P_INVALID)) { /* Tree is empty. 
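+     * (A fresh database keeps md_root == P_INVALID until the first put
+     * creates a root leaf page; see the MDBX_NO_ROOT path in
+     * mdbx_cursor_put().)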
*/ + mdbx_debug("%s", "tree is empty"); + return MDBX_NOTFOUND; + } + + mdbx_cassert(mc, root >= NUM_METAS); + if (!mc->mc_pg[0] || mc->mc_pg[0]->mp_pgno != root) { + txnid_t pp_txnid = mc->mc_db->md_mod_txnid; + pp_txnid = /* mc->mc_db->md_mod_txnid maybe zero in a legacy DB */ pp_txnid + ? pp_txnid + : mc->mc_txn->mt_txnid; + MDBX_txn *scan = mc->mc_txn; + do + if (scan->mt_dbistate[mc->mc_dbi] & DBI_DIRTY) { + pp_txnid = scan->mt_front; + break; + } + while ((scan = scan->mt_parent) != nullptr); + if (unlikely((rc = mdbx_page_get(mc, root, &mc->mc_pg[0], pp_txnid) != 0))) + return rc; + } + + mc->mc_snum = 1; + mc->mc_top = 0; + + mdbx_debug("db %d root page %" PRIaPGNO " has flags 0x%X", DDBI(mc), root, + mc->mc_pg[0]->mp_flags); + + if (flags & MDBX_PS_MODIFY) { + if (!(*mc->mc_dbistate & DBI_DIRTY) && unlikely(rc = mdbx_touch_dbi(mc))) + return rc; + if (unlikely(rc = mdbx_page_touch(mc))) + return rc; + } + + if (flags & MDBX_PS_ROOTONLY) + return MDBX_SUCCESS; + + return mdbx_page_search_root(mc, key, flags); +} + +/* Return the data associated with a given node. + * + * [in] mc The cursor for this operation. + * [in] leaf The node being read. + * [out] data Updated to point to the node's data. + * + * Returns 0 on success, non-zero on failure. */ +static __always_inline int mdbx_node_read(MDBX_cursor *mc, MDBX_node *node, + MDBX_val *data, const txnid_t front) { + data->iov_len = node_ds(node); + data->iov_base = node_data(node); + if (unlikely(F_ISSET(node_flags(node), F_BIGDATA))) { + /* Read overflow data. */ + MDBX_page *omp; /* overflow page */ + int rc = mdbx_page_get(mc, node_largedata_pgno(node), &omp, front); + if (unlikely((rc != MDBX_SUCCESS))) { + mdbx_debug("read overflow page %" PRIaPGNO " failed", + node_largedata_pgno(node)); + return rc; + } + data->iov_base = page_data(omp); + } + return MDBX_SUCCESS; +} + +int mdbx_get(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data) { + DKBUF_DEBUG; + mdbx_debug("===> get db %u key [%s]", dbi, DKEY_DEBUG(key)); + + int rc = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(!key || !data)) + return MDBX_EINVAL; + + if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID))) + return MDBX_BAD_DBI; + + MDBX_cursor_couple cx; + rc = mdbx_cursor_init(&cx.outer, txn, dbi); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + return mdbx_cursor_set(&cx.outer, (MDBX_val *)key, data, MDBX_SET).err; +} + +int mdbx_get_equal_or_great(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, + MDBX_val *data) { + int rc = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(!key || !data)) + return MDBX_EINVAL; + + if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID))) + return MDBX_BAD_DBI; + + if (unlikely(txn->mt_flags & MDBX_TXN_BLOCKED)) + return MDBX_BAD_TXN; + + MDBX_cursor_couple cx; + rc = mdbx_cursor_init(&cx.outer, txn, dbi); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + return mdbx_cursor_get(&cx.outer, key, data, MDBX_SET_LOWERBOUND); +} + +int mdbx_get_ex(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, + size_t *values_count) { + DKBUF_DEBUG; + mdbx_debug("===> get db %u key [%s]", dbi, DKEY_DEBUG(key)); + + int rc = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(!key || !data)) + return MDBX_EINVAL; + + if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID))) + return MDBX_BAD_DBI; + + MDBX_cursor_couple cx; + rc = mdbx_cursor_init(&cx.outer, txn, dbi); + if (unlikely(rc != 
MDBX_SUCCESS)) + return rc; + + rc = mdbx_cursor_set(&cx.outer, key, data, MDBX_SET_KEY).err; + if (unlikely(rc != MDBX_SUCCESS)) { + if (rc == MDBX_NOTFOUND && values_count) + *values_count = 0; + return rc; + } + + if (values_count) { + *values_count = 1; + if (cx.outer.mc_xcursor != NULL) { + MDBX_node *node = page_node(cx.outer.mc_pg[cx.outer.mc_top], + cx.outer.mc_ki[cx.outer.mc_top]); + if (F_ISSET(node_flags(node), F_DUPDATA)) { + mdbx_tassert(txn, cx.outer.mc_xcursor == &cx.inner && + (cx.inner.mx_cursor.mc_flags & C_INITIALIZED)); + *values_count = + (sizeof(*values_count) >= sizeof(cx.inner.mx_db.md_entries) || + cx.inner.mx_db.md_entries <= PTRDIFF_MAX) + ? (size_t)cx.inner.mx_db.md_entries + : PTRDIFF_MAX; + } + } + } + return MDBX_SUCCESS; +} + +/* Find a sibling for a page. + * Replaces the page at the top of the cursor's stack with the specified + * sibling, if one exists. + * + * [in] mc The cursor for this operation. + * [in] dir SIBLING_LEFT or SIBLING_RIGHT. + * + * Returns 0 on success, non-zero on failure. */ +static int mdbx_cursor_sibling(MDBX_cursor *mc, int dir) { + int rc; + MDBX_node *node; + MDBX_page *mp; + assert(dir == SIBLING_LEFT || dir == SIBLING_RIGHT); + + if (unlikely(mc->mc_snum < 2)) + return MDBX_NOTFOUND; /* root has no siblings */ + + mdbx_cursor_pop(mc); + mdbx_debug("parent page is page %" PRIaPGNO ", index %u", + mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top]); + + if ((dir == SIBLING_RIGHT) + ? (mc->mc_ki[mc->mc_top] + 1u >= page_numkeys(mc->mc_pg[mc->mc_top])) + : (mc->mc_ki[mc->mc_top] == 0)) { + mdbx_debug("no more keys aside, moving to next %s sibling", + dir ? "right" : "left"); + if (unlikely((rc = mdbx_cursor_sibling(mc, dir)) != MDBX_SUCCESS)) { + /* undo cursor_pop before returning */ + mc->mc_top++; + mc->mc_snum++; + return rc; + } + } else { + assert((dir - 1) == -1 || (dir - 1) == 1); + mc->mc_ki[mc->mc_top] += (indx_t)(dir - 1); + mdbx_debug("just moving to %s index key %u", + (dir == SIBLING_RIGHT) ? "right" : "left", + mc->mc_ki[mc->mc_top]); + } + mdbx_cassert(mc, IS_BRANCH(mc->mc_pg[mc->mc_top])); + + node = page_node(mp = mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if (unlikely((rc = mdbx_page_get(mc, node_pgno(node), &mp, + pp_txnid4chk(mp, mc->mc_txn))) != 0)) { + /* mc will be inconsistent if caller does mc_snum++ as above */ + mc->mc_flags &= ~(C_INITIALIZED | C_EOF); + return rc; + } + + rc = mdbx_cursor_push(mc, mp); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + mc->mc_ki[mc->mc_top] = + (indx_t)((dir == SIBLING_LEFT) ? page_numkeys(mp) - 1 : 0); + return MDBX_SUCCESS; +} + +/* Move the cursor to the next data item. 
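+ *
+ * In public-API terms this is what the MDBX_NEXT family of mdbx_cursor_get()
+ * operations performs. A minimal forward-scan sketch, assuming an already
+ * opened cursor `cur` (the names here are illustrative only):
+ *
+ *   MDBX_val k, v;
+ *   int err;
+ *   while ((err = mdbx_cursor_get(cur, &k, &v, MDBX_NEXT)) == MDBX_SUCCESS)
+ *     ;                              // consume k and v here
+ *   // err == MDBX_NOTFOUND once iteration has passed the last item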
*/ +static int mdbx_cursor_next(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, + MDBX_cursor_op op) { + MDBX_page *mp; + MDBX_node *node; + int rc; + + if ((mc->mc_flags & C_DEL) && op == MDBX_NEXT_DUP) + return MDBX_NOTFOUND; + + if (!(mc->mc_flags & C_INITIALIZED)) + return mdbx_cursor_first(mc, key, data); + + mp = mc->mc_pg[mc->mc_top]; + if (mc->mc_flags & C_EOF) { + if (mc->mc_ki[mc->mc_top] + 1u >= page_numkeys(mp)) + return MDBX_NOTFOUND; + mc->mc_flags ^= C_EOF; + } + + if (mc->mc_db->md_flags & MDBX_DUPSORT) { + node = page_node(mp, mc->mc_ki[mc->mc_top]); + if (F_ISSET(node_flags(node), F_DUPDATA)) { + if (op == MDBX_NEXT || op == MDBX_NEXT_DUP) { + rc = + mdbx_cursor_next(&mc->mc_xcursor->mx_cursor, data, NULL, MDBX_NEXT); + if (op != MDBX_NEXT || rc != MDBX_NOTFOUND) { + if (likely(rc == MDBX_SUCCESS)) + get_key_optional(node, key); + return rc; + } + } + } else { + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); + if (op == MDBX_NEXT_DUP) + return MDBX_NOTFOUND; + } + } + + mdbx_debug("cursor_next: top page is %" PRIaPGNO " in cursor %p", mp->mp_pgno, + (void *)mc); + if (mc->mc_flags & C_DEL) { + mc->mc_flags ^= C_DEL; + goto skip; + } + + int ki = mc->mc_ki[mc->mc_top]; + mc->mc_ki[mc->mc_top] = (indx_t)++ki; + const int numkeys = page_numkeys(mp); + if (unlikely(ki >= numkeys)) { + mdbx_debug("%s", "=====> move to next sibling page"); + mc->mc_ki[mc->mc_top] = (indx_t)(numkeys - 1); + if (unlikely((rc = mdbx_cursor_sibling(mc, SIBLING_RIGHT)) != + MDBX_SUCCESS)) { + mc->mc_flags |= C_EOF; + return rc; + } + mp = mc->mc_pg[mc->mc_top]; + mdbx_debug("next page is %" PRIaPGNO ", key index %u", mp->mp_pgno, + mc->mc_ki[mc->mc_top]); + } + +skip: + mdbx_debug("==> cursor points to page %" PRIaPGNO + " with %u keys, key index %u", + mp->mp_pgno, page_numkeys(mp), mc->mc_ki[mc->mc_top]); + + if (!MDBX_DISABLE_PAGECHECKS && unlikely(!IS_LEAF(mp))) + return MDBX_CORRUPTED; + + if (IS_LEAF2(mp)) { + if (likely(key)) { + key->iov_len = mc->mc_db->md_xsize; + key->iov_base = page_leaf2key(mp, mc->mc_ki[mc->mc_top], key->iov_len); + } + return MDBX_SUCCESS; + } + + node = page_node(mp, mc->mc_ki[mc->mc_top]); + if (F_ISSET(node_flags(node), F_DUPDATA)) { + rc = mdbx_xcursor_init1(mc, node, mp); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + } else if (likely(data)) { + if (unlikely((rc = mdbx_node_read(mc, node, data, + pp_txnid4chk(mp, mc->mc_txn))) != + MDBX_SUCCESS)) + return rc; + } + + get_key_optional(node, key); + return MDBX_SUCCESS; +} + +/* Move the cursor to the previous data item. 
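+ *
+ * This serves MDBX_PREV / MDBX_PREV_DUP / MDBX_PREV_NODUP. A reverse-scan
+ * sketch, assuming an open cursor `cur` (illustrative names):
+ *
+ *   MDBX_val k, v;
+ *   int err = mdbx_cursor_get(cur, &k, &v, MDBX_LAST);
+ *   while (err == MDBX_SUCCESS)
+ *     err = mdbx_cursor_get(cur, &k, &v, MDBX_PREV);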
*/ +static int mdbx_cursor_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, + MDBX_cursor_op op) { + MDBX_page *mp; + MDBX_node *node; + int rc; + + if ((mc->mc_flags & C_DEL) && op == MDBX_PREV_DUP) + return MDBX_NOTFOUND; + + if (!(mc->mc_flags & C_INITIALIZED)) { + rc = mdbx_cursor_last(mc, key, data); + if (unlikely(rc)) + return rc; + mc->mc_ki[mc->mc_top]++; + } + + mp = mc->mc_pg[mc->mc_top]; + if ((mc->mc_db->md_flags & MDBX_DUPSORT) && + mc->mc_ki[mc->mc_top] < page_numkeys(mp)) { + node = page_node(mp, mc->mc_ki[mc->mc_top]); + if (F_ISSET(node_flags(node), F_DUPDATA)) { + if (op == MDBX_PREV || op == MDBX_PREV_DUP) { + rc = + mdbx_cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, MDBX_PREV); + if (op != MDBX_PREV || rc != MDBX_NOTFOUND) { + if (likely(rc == MDBX_SUCCESS)) { + get_key_optional(node, key); + mc->mc_flags &= ~C_EOF; + } + return rc; + } + } + } else { + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); + if (op == MDBX_PREV_DUP) + return MDBX_NOTFOUND; + } + } + + mdbx_debug("cursor_prev: top page is %" PRIaPGNO " in cursor %p", mp->mp_pgno, + (void *)mc); + + mc->mc_flags &= ~(C_EOF | C_DEL); + + int ki = mc->mc_ki[mc->mc_top]; + mc->mc_ki[mc->mc_top] = (indx_t)--ki; + if (unlikely(ki < 0)) { + mc->mc_ki[mc->mc_top] = 0; + mdbx_debug("%s", "=====> move to prev sibling page"); + if ((rc = mdbx_cursor_sibling(mc, SIBLING_LEFT)) != MDBX_SUCCESS) + return rc; + mp = mc->mc_pg[mc->mc_top]; + mdbx_debug("prev page is %" PRIaPGNO ", key index %u", mp->mp_pgno, + mc->mc_ki[mc->mc_top]); + } + mdbx_debug("==> cursor points to page %" PRIaPGNO + " with %u keys, key index %u", + mp->mp_pgno, page_numkeys(mp), mc->mc_ki[mc->mc_top]); + + if (!MDBX_DISABLE_PAGECHECKS && unlikely(!IS_LEAF(mp))) + return MDBX_CORRUPTED; + + if (IS_LEAF2(mp)) { + if (likely(key)) { + key->iov_len = mc->mc_db->md_xsize; + key->iov_base = page_leaf2key(mp, mc->mc_ki[mc->mc_top], key->iov_len); + } + return MDBX_SUCCESS; + } + + node = page_node(mp, mc->mc_ki[mc->mc_top]); + + if (F_ISSET(node_flags(node), F_DUPDATA)) { + rc = mdbx_xcursor_init1(mc, node, mp); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + rc = mdbx_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + } else if (likely(data)) { + if (unlikely((rc = mdbx_node_read(mc, node, data, + pp_txnid4chk(mp, mc->mc_txn))) != + MDBX_SUCCESS)) + return rc; + } + + get_key_optional(node, key); + return MDBX_SUCCESS; +} + +/* Set the cursor on a specific data item. 
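+ *
+ * Backs MDBX_SET, MDBX_SET_KEY, MDBX_SET_RANGE and the GET_BOTH lookups of
+ * mdbx_cursor_get(); mdbx_get() is a thin wrapper performing MDBX_SET, and
+ * MDBX_SET_LOWERBOUND layers on MDBX_SET_RANGE, reporting an inexact hit as
+ * MDBX_RESULT_TRUE. A lookup sketch, assuming cursor `cur` and a filled-in
+ * key `k` (illustrative names):
+ *
+ *   MDBX_val v;
+ *   int err = mdbx_cursor_get(cur, &k, &v, MDBX_SET);       // exact key only
+ *   err = mdbx_cursor_get(cur, &k, &v, MDBX_SET_RANGE);     // first key >= k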
*/ +static struct cursor_set_result mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, + MDBX_val *data, + MDBX_cursor_op op) { + MDBX_page *mp; + MDBX_node *node = NULL; + DKBUF_DEBUG; + + struct cursor_set_result ret; + ret.exact = false; + if (unlikely(key->iov_len < mc->mc_dbx->md_klen_min || + key->iov_len > mc->mc_dbx->md_klen_max)) { + mdbx_cassert(mc, !"Invalid key-size"); + ret.err = MDBX_BAD_VALSIZE; + return ret; + } + + MDBX_val aligned_key = *key; + uint64_t aligned_keybytes; + if (mc->mc_db->md_flags & MDBX_INTEGERKEY) { + switch (aligned_key.iov_len) { + default: + mdbx_cassert(mc, !"key-size is invalid for MDBX_INTEGERKEY"); + ret.err = MDBX_BAD_VALSIZE; + return ret; + case 4: + if (unlikely(3 & (uintptr_t)aligned_key.iov_base)) + /* copy instead of return error to avoid break compatibility */ + aligned_key.iov_base = + memcpy(&aligned_keybytes, aligned_key.iov_base, 4); + break; + case 8: + if (unlikely(7 & (uintptr_t)aligned_key.iov_base)) + /* copy instead of return error to avoid break compatibility */ + aligned_key.iov_base = + memcpy(&aligned_keybytes, aligned_key.iov_base, 8); + break; + } + } + + if (mc->mc_xcursor) + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); + + /* See if we're already on the right page */ + if (mc->mc_flags & C_INITIALIZED) { + MDBX_val nodekey; + + mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); + mp = mc->mc_pg[mc->mc_top]; + if (unlikely(!page_numkeys(mp))) { + mc->mc_ki[mc->mc_top] = 0; + mc->mc_flags |= C_EOF; + ret.err = MDBX_NOTFOUND; + return ret; + } + if (IS_LEAF2(mp)) { + nodekey.iov_len = mc->mc_db->md_xsize; + nodekey.iov_base = page_leaf2key(mp, 0, nodekey.iov_len); + } else { + node = page_node(mp, 0); + get_key(node, &nodekey); + } + int cmp = mc->mc_dbx->md_cmp(&aligned_key, &nodekey); + if (unlikely(cmp == 0)) { + /* Probably happens rarely, but first node on the page + * was the one we wanted. */ + mc->mc_ki[mc->mc_top] = 0; + ret.exact = true; + mdbx_cassert(mc, mc->mc_ki[mc->mc_top] < + page_numkeys(mc->mc_pg[mc->mc_top]) || + (mc->mc_flags & C_EOF)); + goto got_node; + } + if (cmp > 0) { + const unsigned nkeys = page_numkeys(mp); + if (nkeys > 1) { + if (IS_LEAF2(mp)) { + nodekey.iov_base = page_leaf2key(mp, nkeys - 1, nodekey.iov_len); + } else { + node = page_node(mp, nkeys - 1); + get_key(node, &nodekey); + } + cmp = mc->mc_dbx->md_cmp(&aligned_key, &nodekey); + if (cmp == 0) { + /* last node was the one we wanted */ + mdbx_cassert(mc, nkeys >= 1 && nkeys <= UINT16_MAX + 1); + mc->mc_ki[mc->mc_top] = (indx_t)(nkeys - 1); + ret.exact = true; + mdbx_cassert(mc, mc->mc_ki[mc->mc_top] < + page_numkeys(mc->mc_pg[mc->mc_top]) || + (mc->mc_flags & C_EOF)); + goto got_node; + } + if (cmp < 0) { + if (mc->mc_ki[mc->mc_top] < page_numkeys(mp)) { + /* This is definitely the right page, skip search_page */ + if (IS_LEAF2(mp)) { + nodekey.iov_base = + page_leaf2key(mp, mc->mc_ki[mc->mc_top], nodekey.iov_len); + } else { + node = page_node(mp, mc->mc_ki[mc->mc_top]); + get_key(node, &nodekey); + } + cmp = mc->mc_dbx->md_cmp(&aligned_key, &nodekey); + if (cmp == 0) { + /* current node was the one we wanted */ + ret.exact = true; + mdbx_cassert(mc, mc->mc_ki[mc->mc_top] < + page_numkeys(mc->mc_pg[mc->mc_top]) || + (mc->mc_flags & C_EOF)); + goto got_node; + } + } + mc->mc_flags &= ~C_EOF; + goto search_node; + } + } + /* If any parents have right-sibs, search. + * Otherwise, there's nothing further. 
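+       * E.g. when the cursor already sits on the last entry at every
+       * level of the page stack, the current leaf is the rightmost one,
+       * so the check below ends with C_EOF and MDBX_NOTFOUND.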
*/ + unsigned i; + for (i = 0; i < mc->mc_top; i++) + if (mc->mc_ki[i] < page_numkeys(mc->mc_pg[i]) - 1) + break; + if (i == mc->mc_top) { + /* There are no other pages */ + mdbx_cassert(mc, nkeys <= UINT16_MAX); + mc->mc_ki[mc->mc_top] = (uint16_t)nkeys; + mc->mc_flags |= C_EOF; + ret.err = MDBX_NOTFOUND; + return ret; + } + } + if (!mc->mc_top) { + /* There are no other pages */ + mc->mc_ki[mc->mc_top] = 0; + if (op == MDBX_SET_RANGE) + goto got_node; + + mdbx_cassert(mc, mc->mc_ki[mc->mc_top] < + page_numkeys(mc->mc_pg[mc->mc_top]) || + (mc->mc_flags & C_EOF)); + ret.err = MDBX_NOTFOUND; + return ret; + } + } else { + mc->mc_pg[0] = 0; + } + + ret.err = mdbx_page_search(mc, &aligned_key, 0); + if (unlikely(ret.err != MDBX_SUCCESS)) + return ret; + + mp = mc->mc_pg[mc->mc_top]; + mdbx_cassert(mc, IS_LEAF(mp)); + +search_node:; + struct node_result nsr = mdbx_node_search(mc, &aligned_key); + node = nsr.node; + ret.exact = nsr.exact; + if (!ret.exact) { + if (op != MDBX_SET_RANGE) { + /* MDBX_SET specified and not an exact match. */ + if (unlikely(mc->mc_ki[mc->mc_top] >= + page_numkeys(mc->mc_pg[mc->mc_top]))) + mc->mc_flags |= C_EOF; + ret.err = MDBX_NOTFOUND; + return ret; + } + + if (node == NULL) { + mdbx_debug("%s", "===> inexact leaf not found, goto sibling"); + ret.err = mdbx_cursor_sibling(mc, SIBLING_RIGHT); + if (unlikely(ret.err != MDBX_SUCCESS)) { + mc->mc_flags |= C_EOF; + return ret; /* no entries matched */ + } + mp = mc->mc_pg[mc->mc_top]; + mdbx_cassert(mc, IS_LEAF(mp)); + if (!IS_LEAF2(mp)) + node = page_node(mp, 0); + } + } + mdbx_cassert(mc, + mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top]) || + (mc->mc_flags & C_EOF)); + +got_node: + mc->mc_flags |= C_INITIALIZED; + mc->mc_flags &= ~C_EOF; + + if (IS_LEAF2(mp)) { + if (op == MDBX_SET_RANGE || op == MDBX_SET_KEY) { + key->iov_len = mc->mc_db->md_xsize; + key->iov_base = page_leaf2key(mp, mc->mc_ki[mc->mc_top], key->iov_len); + } + ret.err = MDBX_SUCCESS; + return ret; + } + + if (F_ISSET(node_flags(node), F_DUPDATA)) { + ret.err = mdbx_xcursor_init1(mc, node, mp); + if (unlikely(ret.err != MDBX_SUCCESS)) + return ret; + if (op == MDBX_SET || op == MDBX_SET_KEY || op == MDBX_SET_RANGE) { + ret.err = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + if (unlikely(ret.err != MDBX_SUCCESS)) + return ret; + } else { + ret = mdbx_cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, + MDBX_SET_RANGE); + if (unlikely(ret.err != MDBX_SUCCESS)) + return ret; + } + } else if (likely(data)) { + if (op == MDBX_GET_BOTH || op == MDBX_GET_BOTH_RANGE) { + if (unlikely(data->iov_len < mc->mc_dbx->md_vlen_min || + data->iov_len > mc->mc_dbx->md_vlen_max)) { + mdbx_cassert(mc, !"Invalid data-size"); + ret.err = MDBX_BAD_VALSIZE; + return ret; + } + MDBX_val aligned_data = *data; + uint64_t aligned_databytes; + if (mc->mc_db->md_flags & MDBX_INTEGERDUP) { + switch (aligned_data.iov_len) { + default: + mdbx_cassert(mc, !"data-size is invalid for MDBX_INTEGERDUP"); + ret.err = MDBX_BAD_VALSIZE; + return ret; + case 4: + if (unlikely(3 & (uintptr_t)aligned_data.iov_base)) + /* copy instead of return error to avoid break compatibility */ + aligned_data.iov_base = + memcpy(&aligned_databytes, aligned_data.iov_base, 4); + break; + case 8: + if (unlikely(7 & (uintptr_t)aligned_data.iov_base)) + /* copy instead of return error to avoid break compatibility */ + aligned_data.iov_base = + memcpy(&aligned_databytes, aligned_data.iov_base, 8); + break; + } + } + MDBX_val olddata; + ret.err = mdbx_node_read(mc, node, &olddata, + 
pp_txnid4chk(mc->mc_pg[mc->mc_top], mc->mc_txn)); + if (unlikely(ret.err != MDBX_SUCCESS)) + return ret; + const int cmp = mc->mc_dbx->md_dcmp(&aligned_data, &olddata); + if (cmp) { + mdbx_cassert(mc, mc->mc_ki[mc->mc_top] < + page_numkeys(mc->mc_pg[mc->mc_top]) || + (mc->mc_flags & C_EOF)); + if (op != MDBX_GET_BOTH_RANGE || cmp > 0) { + ret.err = MDBX_NOTFOUND; + return ret; + } + } + *data = olddata; + } else { + ret.err = mdbx_node_read(mc, node, data, + pp_txnid4chk(mc->mc_pg[mc->mc_top], mc->mc_txn)); + if (unlikely(ret.err != MDBX_SUCCESS)) + return ret; + } + } + + /* The key already matches in all other cases */ + if (op == MDBX_SET_RANGE || op == MDBX_SET_KEY) + get_key_optional(node, key); + + mdbx_debug("==> cursor placed on key [%s], data [%s]", DKEY_DEBUG(key), + DVAL_DEBUG(data)); + ret.err = MDBX_SUCCESS; + return ret; +} + +/* Move the cursor to the first item in the database. */ +static int mdbx_cursor_first(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { + int rc; + + if (mc->mc_xcursor) + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); + + if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { + rc = mdbx_page_search(mc, NULL, MDBX_PS_FIRST); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + } + + if (!MDBX_DISABLE_PAGECHECKS && unlikely(!IS_LEAF(mc->mc_pg[mc->mc_top]))) + return MDBX_CORRUPTED; + + mc->mc_flags |= C_INITIALIZED; + mc->mc_flags &= ~C_EOF; + mc->mc_ki[mc->mc_top] = 0; + + if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { + if (likely(key)) { + key->iov_len = mc->mc_db->md_xsize; + key->iov_base = page_leaf2key(mc->mc_pg[mc->mc_top], 0, key->iov_len); + } + return MDBX_SUCCESS; + } + + MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], 0); + if (F_ISSET(node_flags(node), F_DUPDATA)) { + rc = mdbx_xcursor_init1(mc, node, mc->mc_pg[mc->mc_top]); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + if (unlikely(rc)) + return rc; + } else if (likely(data)) { + if (unlikely((rc = mdbx_node_read( + mc, node, data, + pp_txnid4chk(mc->mc_pg[mc->mc_top], mc->mc_txn))) != + MDBX_SUCCESS)) + return rc; + } + + get_key_optional(node, key); + return MDBX_SUCCESS; +} + +/* Move the cursor to the last item in the database. 
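+ *
+ * This implements MDBX_LAST and also seeds the MDBX_APPEND fast-path of
+ * mdbx_cursor_put(). E.g., assuming an open cursor `cur` (illustrative name):
+ *
+ *   MDBX_val k, v;
+ *   int err = mdbx_cursor_get(cur, &k, &v, MDBX_LAST);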
*/ +static int mdbx_cursor_last(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { + int rc; + + if (mc->mc_xcursor) + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); + + if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { + rc = mdbx_page_search(mc, NULL, MDBX_PS_LAST); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + } + + if (!MDBX_DISABLE_PAGECHECKS && unlikely(!IS_LEAF(mc->mc_pg[mc->mc_top]))) + return MDBX_CORRUPTED; + + mc->mc_ki[mc->mc_top] = (indx_t)page_numkeys(mc->mc_pg[mc->mc_top]) - 1; + mc->mc_flags |= C_INITIALIZED | C_EOF; + + if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { + if (likely(key)) { + key->iov_len = mc->mc_db->md_xsize; + key->iov_base = page_leaf2key(mc->mc_pg[mc->mc_top], + mc->mc_ki[mc->mc_top], key->iov_len); + } + return MDBX_SUCCESS; + } + + MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if (F_ISSET(node_flags(node), F_DUPDATA)) { + rc = mdbx_xcursor_init1(mc, node, mc->mc_pg[mc->mc_top]); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + rc = mdbx_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); + if (unlikely(rc)) + return rc; + } else if (likely(data)) { + if (unlikely((rc = mdbx_node_read( + mc, node, data, + pp_txnid4chk(mc->mc_pg[mc->mc_top], mc->mc_txn))) != + MDBX_SUCCESS)) + return rc; + } + + get_key_optional(node, key); + return MDBX_SUCCESS; +} + +int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, + MDBX_cursor_op op) { + if (unlikely(mc == NULL)) + return MDBX_EINVAL; + + if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) + return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL + : MDBX_EBADSIGN; + + int rc = check_txn(mc->mc_txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + int (*mfunc)(MDBX_cursor * mc, MDBX_val * key, MDBX_val * data); + switch (op) { + case MDBX_GET_CURRENT: { + if (unlikely(!(mc->mc_flags & C_INITIALIZED))) + return MDBX_ENODATA; + MDBX_page *mp = mc->mc_pg[mc->mc_top]; + const unsigned nkeys = page_numkeys(mp); + if (mc->mc_ki[mc->mc_top] >= nkeys) { + mdbx_cassert(mc, nkeys <= UINT16_MAX); + mc->mc_ki[mc->mc_top] = (uint16_t)nkeys; + mc->mc_flags |= C_EOF; + return MDBX_NOTFOUND; + } + mdbx_cassert(mc, nkeys > 0); + + rc = MDBX_SUCCESS; + if (IS_LEAF2(mp)) { + key->iov_len = mc->mc_db->md_xsize; + key->iov_base = page_leaf2key(mp, mc->mc_ki[mc->mc_top], key->iov_len); + } else { + MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); + get_key_optional(node, key); + if (data) { + if (F_ISSET(node_flags(node), F_DUPDATA)) { + if (unlikely(!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))) { + rc = mdbx_xcursor_init1(mc, node, mp); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + if (unlikely(rc)) + return rc; + } else { + rc = mdbx_cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL, + MDBX_GET_CURRENT); + if (unlikely(rc)) + return rc; + } + } else { + rc = mdbx_node_read(mc, node, data, pp_txnid4chk(mp, mc->mc_txn)); + if (unlikely(rc)) + return rc; + } + } + } + break; + } + case MDBX_GET_BOTH: + case MDBX_GET_BOTH_RANGE: + if (unlikely(data == NULL)) + return MDBX_EINVAL; + if (unlikely(mc->mc_xcursor == NULL)) + return MDBX_INCOMPATIBLE; + /* fall through */ + __fallthrough; + case MDBX_SET: + case MDBX_SET_KEY: + case MDBX_SET_RANGE: + if (unlikely(key == NULL)) + return MDBX_EINVAL; + rc = mdbx_cursor_set(mc, key, data, op).err; + if (mc->mc_flags & C_INITIALIZED) { + mdbx_cassert(mc, mc->mc_snum > 0 && mc->mc_top < mc->mc_snum); + mdbx_cassert(mc, 
mc->mc_ki[mc->mc_top] < + page_numkeys(mc->mc_pg[mc->mc_top]) || + (mc->mc_flags & C_EOF)); + } + break; + case MDBX_GET_MULTIPLE: + if (unlikely(data == NULL || !(mc->mc_flags & C_INITIALIZED))) + return MDBX_EINVAL; + if (unlikely(!(mc->mc_db->md_flags & MDBX_DUPFIXED))) + return MDBX_INCOMPATIBLE; + rc = MDBX_SUCCESS; + if ((mc->mc_xcursor->mx_cursor.mc_flags & (C_INITIALIZED | C_EOF)) != + C_INITIALIZED) + break; + goto fetchm; + case MDBX_NEXT_MULTIPLE: + if (unlikely(data == NULL)) + return MDBX_EINVAL; + if (unlikely(!(mc->mc_db->md_flags & MDBX_DUPFIXED))) + return MDBX_INCOMPATIBLE; + rc = mdbx_cursor_next(mc, key, data, MDBX_NEXT_DUP); + if (rc == MDBX_SUCCESS) { + if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { + MDBX_cursor *mx; + fetchm: + mx = &mc->mc_xcursor->mx_cursor; + data->iov_len = + page_numkeys(mx->mc_pg[mx->mc_top]) * mx->mc_db->md_xsize; + data->iov_base = page_data(mx->mc_pg[mx->mc_top]); + mx->mc_ki[mx->mc_top] = (indx_t)page_numkeys(mx->mc_pg[mx->mc_top]) - 1; + } else { + rc = MDBX_NOTFOUND; + } + } + break; + case MDBX_PREV_MULTIPLE: + if (data == NULL) + return MDBX_EINVAL; + if (!(mc->mc_db->md_flags & MDBX_DUPFIXED)) + return MDBX_INCOMPATIBLE; + rc = MDBX_SUCCESS; + if (!(mc->mc_flags & C_INITIALIZED)) + rc = mdbx_cursor_last(mc, key, data); + if (rc == MDBX_SUCCESS) { + MDBX_cursor *mx = &mc->mc_xcursor->mx_cursor; + if (mx->mc_flags & C_INITIALIZED) { + rc = mdbx_cursor_sibling(mx, SIBLING_LEFT); + if (rc == MDBX_SUCCESS) + goto fetchm; + } else { + rc = MDBX_NOTFOUND; + } + } + break; + case MDBX_NEXT: + case MDBX_NEXT_DUP: + case MDBX_NEXT_NODUP: + rc = mdbx_cursor_next(mc, key, data, op); + break; + case MDBX_PREV: + case MDBX_PREV_DUP: + case MDBX_PREV_NODUP: + rc = mdbx_cursor_prev(mc, key, data, op); + break; + case MDBX_FIRST: + rc = mdbx_cursor_first(mc, key, data); + break; + case MDBX_FIRST_DUP: + mfunc = mdbx_cursor_first; + mmove: + if (unlikely(data == NULL || !(mc->mc_flags & C_INITIALIZED))) + return MDBX_EINVAL; + if (unlikely(mc->mc_xcursor == NULL)) + return MDBX_INCOMPATIBLE; + if (mc->mc_ki[mc->mc_top] >= page_numkeys(mc->mc_pg[mc->mc_top])) { + mc->mc_ki[mc->mc_top] = (indx_t)page_numkeys(mc->mc_pg[mc->mc_top]); + mc->mc_flags |= C_EOF; + return MDBX_NOTFOUND; + } + { + MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if (!F_ISSET(node_flags(node), F_DUPDATA)) { + get_key_optional(node, key); + rc = mdbx_node_read(mc, node, data, + pp_txnid4chk(mc->mc_pg[mc->mc_top], mc->mc_txn)); + break; + } + } + if (unlikely(!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))) + return MDBX_EINVAL; + rc = mfunc(&mc->mc_xcursor->mx_cursor, data, NULL); + break; + case MDBX_LAST: + rc = mdbx_cursor_last(mc, key, data); + break; + case MDBX_LAST_DUP: + mfunc = mdbx_cursor_last; + goto mmove; + case MDBX_SET_LOWERBOUND: { + if (unlikely(key == NULL || data == NULL)) + return MDBX_EINVAL; + MDBX_val save_data = *data; + struct cursor_set_result csr = + mdbx_cursor_set(mc, key, data, MDBX_SET_RANGE); + rc = csr.err; + if (rc == MDBX_SUCCESS && csr.exact && mc->mc_xcursor) { + mc->mc_flags &= ~C_DEL; + if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { + *data = save_data; + csr = mdbx_cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, + MDBX_SET_RANGE); + rc = csr.err; + if (rc == MDBX_NOTFOUND) { + mdbx_cassert(mc, !csr.exact); + rc = mdbx_cursor_next(mc, key, data, MDBX_NEXT_NODUP); + } + } else { + int cmp = mc->mc_dbx->md_dcmp(&save_data, data); + csr.exact = (cmp == 0); + if (cmp > 0) + rc = 
mdbx_cursor_next(mc, key, data, MDBX_NEXT_NODUP); + } + } + if (rc == MDBX_SUCCESS && !csr.exact) + rc = MDBX_RESULT_TRUE; + break; + } + default: + mdbx_debug("unhandled/unimplemented cursor operation %u", op); + return MDBX_EINVAL; + } + + mc->mc_flags &= ~C_DEL; + return rc; +} + +static int mdbx_touch_dbi(MDBX_cursor *mc) { + mdbx_cassert(mc, (*mc->mc_dbistate & DBI_DIRTY) == 0); + *mc->mc_dbistate |= DBI_DIRTY; + mc->mc_txn->mt_flags |= MDBX_TXN_DIRTY; + if (mc->mc_dbi >= CORE_DBS) { + mdbx_cassert(mc, (mc->mc_flags & C_RECLAIMING) == 0); + /* Touch DB record of named DB */ + MDBX_cursor_couple cx; + int rc = mdbx_cursor_init(&cx.outer, mc->mc_txn, MAIN_DBI); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + mc->mc_txn->mt_dbistate[MAIN_DBI] |= DBI_DIRTY; + rc = mdbx_page_search(&cx.outer, &mc->mc_dbx->md_name, MDBX_PS_MODIFY); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + } + return MDBX_SUCCESS; +} + +/* Touch all the pages in the cursor stack. Set mc_top. + * Makes sure all the pages are writable, before attempting a write operation. + * [in] mc The cursor to operate on. */ +static int mdbx_cursor_touch(MDBX_cursor *mc) { + int rc = MDBX_SUCCESS; + if (unlikely((*mc->mc_dbistate & DBI_DIRTY) == 0)) { + rc = mdbx_touch_dbi(mc); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + } + if (likely(mc->mc_snum)) { + mc->mc_top = 0; + do { + rc = mdbx_page_touch(mc); + } while (!rc && ++(mc->mc_top) < mc->mc_snum); + mc->mc_top = mc->mc_snum - 1; + } + return rc; +} + +int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, + unsigned flags) { + MDBX_env *env; + MDBX_page *sub_root = NULL; + MDBX_val xdata, *rdata, dkey, olddata; + MDBX_db nested_dupdb; + int err; + DKBUF_DEBUG; + + if (unlikely(mc == NULL || key == NULL || data == NULL)) + return MDBX_EINVAL; + + if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) + return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL + : MDBX_EBADSIGN; + + int rc = check_txn_rw(mc->mc_txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi))) + return MDBX_BAD_DBI; + + mdbx_cassert(mc, cursor_is_tracked(mc)); + env = mc->mc_txn->mt_env; + + /* Check this first so counter will always be zero on any early failures. */ + size_t mcount = 0, dcount = 0; + if (unlikely(flags & MDBX_MULTIPLE)) { + if (unlikely(flags & MDBX_RESERVE)) + return MDBX_EINVAL; + if (unlikely(!F_ISSET(mc->mc_db->md_flags, MDBX_DUPFIXED))) + return MDBX_INCOMPATIBLE; + dcount = data[1].iov_len; + if (unlikely(dcount < 2 || data->iov_len == 0)) + return MDBX_BAD_VALSIZE; + if (unlikely(mc->mc_db->md_xsize != data->iov_len) && mc->mc_db->md_xsize) + return MDBX_BAD_VALSIZE; + if (unlikely(dcount > MAX_MAPSIZE / 2 / + (BRANCH_NODE_MAX(MAX_PAGESIZE) - NODESIZE))) { + /* checking for multiplication overflow */ + if (unlikely(dcount > MAX_MAPSIZE / 2 / data->iov_len)) + return MDBX_TOO_LARGE; + } + data[1].iov_len = 0 /* reset done item counter */; + } + + if (flags & MDBX_RESERVE) { + if (unlikely(mc->mc_db->md_flags & (MDBX_DUPSORT | MDBX_REVERSEDUP | + MDBX_INTEGERDUP | MDBX_DUPFIXED))) + return MDBX_INCOMPATIBLE; + data->iov_base = nullptr; + } + + const unsigned nospill = flags & MDBX_NOSPILL; + flags -= nospill; + + if (unlikely(mc->mc_txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED))) + return (mc->mc_txn->mt_flags & MDBX_TXN_RDONLY) ? 
MDBX_EACCESS
+                                                    : MDBX_BAD_TXN;
+
+  uint64_t aligned_keybytes, aligned_databytes;
+  MDBX_val aligned_key, aligned_data;
+  if (likely((mc->mc_flags & C_SUB) == 0)) {
+    if (unlikely(key->iov_len < mc->mc_dbx->md_klen_min ||
+                 key->iov_len > mc->mc_dbx->md_klen_max)) {
+      mdbx_cassert(mc, !"Invalid key-size");
+      return MDBX_BAD_VALSIZE;
+    }
+    if (unlikely(data->iov_len < mc->mc_dbx->md_vlen_min ||
+                 data->iov_len > mc->mc_dbx->md_vlen_max)) {
+      mdbx_cassert(mc, !"Invalid data-size");
+      return MDBX_BAD_VALSIZE;
+    }
+
+    if (mc->mc_db->md_flags & MDBX_INTEGERKEY) {
+      switch (key->iov_len) {
+      default:
+        mdbx_cassert(mc, !"key-size is invalid for MDBX_INTEGERKEY");
+        return MDBX_BAD_VALSIZE;
+      case 4:
+        if (unlikely(3 & (uintptr_t)key->iov_base)) {
+          /* copy instead of returning an error, to avoid breaking
+           * compatibility */
+          aligned_key.iov_base =
+              memcpy(&aligned_keybytes, key->iov_base, aligned_key.iov_len = 4);
+          key = &aligned_key;
+        }
+        break;
+      case 8:
+        if (unlikely(7 & (uintptr_t)key->iov_base)) {
+          /* copy instead of returning an error, to avoid breaking
+           * compatibility */
+          aligned_key.iov_base =
+              memcpy(&aligned_keybytes, key->iov_base, aligned_key.iov_len = 8);
+          key = &aligned_key;
+        }
+        break;
+      }
+    }
+    if (mc->mc_db->md_flags & MDBX_INTEGERDUP) {
+      switch (data->iov_len) {
+      default:
+        mdbx_cassert(mc, !"data-size is invalid for MDBX_INTEGERDUP");
+        return MDBX_BAD_VALSIZE;
+      case 4:
+        if (unlikely(3 & (uintptr_t)data->iov_base)) {
+          if (unlikely(flags & MDBX_MULTIPLE))
+            return MDBX_BAD_VALSIZE;
+          /* copy instead of returning an error, to avoid breaking
+           * compatibility */
+          aligned_data.iov_base = memcpy(&aligned_databytes, data->iov_base,
+                                         aligned_data.iov_len = 4);
+          data = &aligned_data;
+        }
+        break;
+      case 8:
+        if (unlikely(7 & (uintptr_t)data->iov_base)) {
+          if (unlikely(flags & MDBX_MULTIPLE))
+            return MDBX_BAD_VALSIZE;
+          /* copy instead of returning an error, to avoid breaking
+           * compatibility */
+          aligned_data.iov_base = memcpy(&aligned_databytes, data->iov_base,
+                                         aligned_data.iov_len = 8);
+          data = &aligned_data;
+        }
+        break;
+      }
+    }
+  }
+
+  mdbx_debug(
+      "==> put db %d key [%s], size %" PRIuPTR ", data [%s] size %" PRIuPTR,
+      DDBI(mc), DKEY_DEBUG(key), key->iov_len,
+      DVAL_DEBUG((flags & MDBX_RESERVE) ? nullptr : data), data->iov_len);
+
+  int dupdata_flag = 0;
+  if ((flags & MDBX_CURRENT) != 0 && (mc->mc_flags & C_SUB) == 0) {
+    if (unlikely(flags & (MDBX_APPEND | MDBX_NOOVERWRITE)))
+      return MDBX_EINVAL;
+    /* The MDBX_CURRENT option means an update of the record the cursor is
+     * currently positioned at was requested. Check that the supplied key
+     * matches the value at the current cursor position. It is simpler to
+     * call mdbx_cursor_get() here, since serving tables with MDBX_DUPSORT
+     * also requires the current data size. */
+    MDBX_val current_key, current_data;
+    rc = mdbx_cursor_get(mc, &current_key, &current_data, MDBX_GET_CURRENT);
+    if (unlikely(rc != MDBX_SUCCESS))
+      return rc;
+    if (mc->mc_dbx->md_cmp(key, &current_key) != 0)
+      return MDBX_EKEYMISMATCH;
+
+    if (unlikely((flags & MDBX_MULTIPLE)))
+      goto drop_current;
+
+    if (F_ISSET(mc->mc_db->md_flags, MDBX_DUPSORT)) {
+      MDBX_node *node =
+          page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
+      if (F_ISSET(node_flags(node), F_DUPDATA)) {
+        mdbx_cassert(mc,
+                     mc->mc_xcursor != NULL &&
+                         (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED));
+        /* If the key carries more than one value, or if the data size
+         * differs, then instead of an in-place update we must delete the
+         * old record and insert the new one.
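+         * (The drop_current path below deletes the old record, clears
+         * MDBX_CURRENT from the flags and then proceeds as a plain put.)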
*/
+        if (mc->mc_xcursor->mx_db.md_entries > 1 ||
+            current_data.iov_len != data->iov_len) {
+        drop_current:
+          rc = mdbx_cursor_del(mc, flags & MDBX_ALLDUPS);
+          if (unlikely(rc != MDBX_SUCCESS))
+            return rc;
+          flags -= MDBX_CURRENT;
+          goto skip_check_samedata;
+        }
+      } else if (unlikely(node_size(key, data) > env->me_leaf_nodemax)) {
+        rc = mdbx_cursor_del(mc, 0);
+        if (unlikely(rc != MDBX_SUCCESS))
+          return rc;
+        flags -= MDBX_CURRENT;
+        goto skip_check_samedata;
+      }
+    }
+    if (!(flags & MDBX_RESERVE) &&
+        unlikely(cmp_lenfast(&current_data, data) == 0))
+      return MDBX_SUCCESS /* the same data, nothing to update */;
+  skip_check_samedata:;
+  }
+
+  if (mc->mc_db->md_root == P_INVALID) {
+    /* new database, cursor has nothing to point to */
+    mc->mc_snum = 0;
+    mc->mc_top = 0;
+    mc->mc_flags &= ~C_INITIALIZED;
+    rc = MDBX_NO_ROOT;
+  } else if ((flags & MDBX_CURRENT) == 0) {
+    bool exact = false;
+    if ((flags & MDBX_APPEND) && mc->mc_db->md_entries > 0) {
+      rc = mdbx_cursor_last(mc, &dkey, &olddata);
+      if (likely(rc == MDBX_SUCCESS)) {
+        rc = mc->mc_dbx->md_cmp(key, &dkey);
+        if (likely(rc > 0)) {
+          mc->mc_ki[mc->mc_top]++; /* step forward for appending */
+          rc = MDBX_NOTFOUND;
+        } else {
+          if (unlikely(rc != MDBX_SUCCESS || !(flags & MDBX_APPENDDUP)))
+            /* new-key < last-key
+             * or new-key == last-key without MDBX_APPENDDUP */
+            return MDBX_EKEYMISMATCH;
+          exact = true;
+        }
+      }
+    } else {
+      struct cursor_set_result csr =
+          mdbx_cursor_set(mc, (MDBX_val *)key, &olddata, MDBX_SET);
+      rc = csr.err;
+      exact = csr.exact;
+    }
+    if (likely(rc == MDBX_SUCCESS)) {
+      if (exact) {
+        if (unlikely(flags & MDBX_NOOVERWRITE)) {
+          mdbx_debug("duplicate key [%s]", DKEY_DEBUG(key));
+          *data = olddata;
+          return MDBX_KEYEXIST;
+        }
+        if (unlikely(mc->mc_flags & C_SUB)) {
+          /* nested subtree of DUPSORT-database with the same key,
+           * nothing to update */
+          mdbx_assert(env, data->iov_len == 0 && olddata.iov_len == 0);
+          return MDBX_SUCCESS;
+        }
+        if (unlikely(flags & MDBX_ALLDUPS) && mc->mc_xcursor &&
+            (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) {
+          rc = mdbx_cursor_del(mc, MDBX_ALLDUPS);
+          if (unlikely(rc != MDBX_SUCCESS))
+            return rc;
+          flags -= MDBX_ALLDUPS;
+          rc = MDBX_NOTFOUND;
+          exact = false;
+        } else /* checking for early exit without dirtying pages */
+          if (!(flags & (MDBX_RESERVE | MDBX_MULTIPLE)) &&
+              unlikely(mc->mc_dbx->md_dcmp(data, &olddata) == 0)) {
+            if (!mc->mc_xcursor)
+              /* the same data, nothing to update */
+              return MDBX_SUCCESS;
+            if (flags & MDBX_NODUPDATA)
+              return MDBX_KEYEXIST;
+            if (flags & MDBX_APPENDDUP)
+              return MDBX_EKEYMISMATCH;
+            if (likely(unsure_equal(mc->mc_dbx->md_dcmp, data, &olddata)))
+              /* the data matches exactly byte-to-byte, nothing to update */
+              return MDBX_SUCCESS;
+            else {
+              /* The data has differences, but the user-provided comparator
+               * considers them equal. So continue the update, since the put
+               * was called without MDBX_NODUPDATA.
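+               * E.g. a comparator that only inspects a fixed-size prefix
+               * treats "abc\1" and "abc\2" as equal, yet their stored
+               * bytes differ and must still be rewritten.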
*/ + } + } + } + } else if (unlikely(rc != MDBX_NOTFOUND)) + return rc; + } + + mc->mc_flags &= ~C_DEL; + + /* Cursor is positioned, check for room in the dirty list */ + if (!nospill) { + rdata = data; + if (unlikely(flags & MDBX_MULTIPLE)) { + rdata = &xdata; + xdata.iov_len = data->iov_len * dcount; + } + if (unlikely(err = mdbx_cursor_spill(mc, key, rdata))) + return err; + } + + if (unlikely(rc == MDBX_NO_ROOT)) { + /* new database, write a root leaf page */ + mdbx_debug("%s", "allocating new root leaf page"); + if (unlikely((*mc->mc_dbistate & DBI_DIRTY) == 0)) { + err = mdbx_touch_dbi(mc); + if (unlikely(err != MDBX_SUCCESS)) + return err; + } + struct page_result npr = mdbx_page_new(mc, P_LEAF, 1); + if (unlikely(npr.err != MDBX_SUCCESS)) + return npr.err; + npr.err = mdbx_cursor_push(mc, npr.page); + if (unlikely(npr.err != MDBX_SUCCESS)) + return npr.err; + mc->mc_db->md_root = npr.page->mp_pgno; + mc->mc_db->md_depth++; + if (mc->mc_db->md_flags & MDBX_INTEGERKEY) { + assert(key->iov_len >= mc->mc_dbx->md_klen_min && + key->iov_len <= mc->mc_dbx->md_klen_max); + mc->mc_dbx->md_klen_min = mc->mc_dbx->md_klen_max = key->iov_len; + } + if (mc->mc_db->md_flags & (MDBX_INTEGERDUP | MDBX_DUPFIXED)) { + assert(data->iov_len >= mc->mc_dbx->md_vlen_min && + data->iov_len <= mc->mc_dbx->md_vlen_max); + assert(mc->mc_xcursor != NULL); + mc->mc_db->md_xsize = mc->mc_xcursor->mx_db.md_xsize = + (unsigned)(mc->mc_dbx->md_vlen_min = mc->mc_dbx->md_vlen_max = + mc->mc_xcursor->mx_dbx.md_klen_min = + mc->mc_xcursor->mx_dbx.md_klen_max = + data->iov_len); + } + if ((mc->mc_db->md_flags & (MDBX_DUPSORT | MDBX_DUPFIXED)) == MDBX_DUPFIXED) + npr.page->mp_flags |= P_LEAF2; + mc->mc_flags |= C_INITIALIZED; + } else { + /* make sure all cursor pages are writable */ + err = mdbx_cursor_touch(mc); + if (unlikely(err)) + return err; + } + + bool insert_key, insert_data, do_sub = false; + insert_key = insert_data = (rc != MDBX_SUCCESS); + uint16_t fp_flags = P_LEAF; + MDBX_page *fp = env->me_pbuf; + fp->mp_txnid = mc->mc_txn->mt_front; + if (insert_key) { + /* The key does not exist */ + mdbx_debug("inserting key at index %i", mc->mc_ki[mc->mc_top]); + if ((mc->mc_db->md_flags & MDBX_DUPSORT) && + node_size(key, data) > env->me_leaf_nodemax) { + /* Too big for a node, insert in sub-DB. Set up an empty + * "old sub-page" for prep_subDB to expand to a full page. */ + fp->mp_leaf2_ksize = + (mc->mc_db->md_flags & MDBX_DUPFIXED) ? 
(uint16_t)data->iov_len : 0; + fp->mp_lower = fp->mp_upper = 0; + olddata.iov_len = PAGEHDRSZ; + goto prep_subDB; + } + } else { + /* there's only a key anyway, so this is a no-op */ + if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { + char *ptr; + unsigned ksize = mc->mc_db->md_xsize; + if (unlikely(key->iov_len != ksize)) + return MDBX_BAD_VALSIZE; + ptr = page_leaf2key(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize); + memcpy(ptr, key->iov_base, ksize); + fix_parent: + /* if overwriting slot 0 of leaf, need to + * update branch key if there is a parent page */ + if (mc->mc_top && !mc->mc_ki[mc->mc_top]) { + unsigned dtop = 1; + mc->mc_top--; + /* slot 0 is always an empty key, find real slot */ + while (mc->mc_top && !mc->mc_ki[mc->mc_top]) { + mc->mc_top--; + dtop++; + } + err = MDBX_SUCCESS; + if (mc->mc_ki[mc->mc_top]) + err = mdbx_update_key(mc, key); + mdbx_cassert(mc, mc->mc_top + dtop < UINT16_MAX); + mc->mc_top += (uint16_t)dtop; + if (unlikely(err != MDBX_SUCCESS)) + return err; + } + + if (mdbx_audit_enabled()) { + err = mdbx_cursor_check(mc, 0); + if (unlikely(err != MDBX_SUCCESS)) + return err; + } + return MDBX_SUCCESS; + } + + more:; + if (mdbx_audit_enabled()) { + err = mdbx_cursor_check(mc, 0); + if (unlikely(err != MDBX_SUCCESS)) + return err; + } + MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + + /* Large/Overflow page overwrites need special handling */ + if (unlikely(F_ISSET(node_flags(node), F_BIGDATA))) { + int dpages = (node_size(key, data) > env->me_leaf_nodemax) + ? number_of_ovpages(env, data->iov_len) + : 0; + + const pgno_t pgno = node_largedata_pgno(node); + struct page_result pgr = mdbx_page_get_ex( + mc, pgno, pp_txnid4chk(mc->mc_pg[mc->mc_top], mc->mc_txn)); + if (unlikely(pgr.err != MDBX_SUCCESS)) + return pgr.err; + if (unlikely(!IS_OVERFLOW(pgr.page))) + return MDBX_CORRUPTED; + + /* Is the ov page from this txn (or a parent) and big enough? */ + int ovpages = pgr.page->mp_pages; + if (!IS_FROZEN(mc->mc_txn, pgr.page) && + (unlikely(mc->mc_flags & C_GCFREEZE) + ? (ovpages >= dpages) + : (ovpages == + /* LY: add configurable threshold to keep reserve space */ + dpages))) { + /* yes, overwrite it. 
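+         * The overflow page belongs to this txn (or a parent) and spans
+         * a suitable number of pages, so the payload can be rewritten in
+         * place rather than retired and re-allocated.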
*/
+        if (!IS_MODIFIABLE(mc->mc_txn, pgr.page)) {
+          if (IS_SPILLED(mc->mc_txn, pgr.page)) {
+            pgr = /* TODO: avoid search and get txn & spill-index from
+                     page_result */
+                mdbx_page_unspill(mc->mc_txn, pgr.page);
+            if (unlikely(pgr.err))
+              return pgr.err;
+          } else {
+            if (unlikely(!mc->mc_txn->mt_parent)) {
+              mdbx_error(
+                  "Unexpected not frozen/modifiable/spilled but shadowed %s "
+                  "page %" PRIaPGNO " mod-txnid %" PRIaTXN ","
+                  " without parent transaction, current txn %" PRIaTXN
+                  " front %" PRIaTXN,
+                  "overflow/large", pgno, pgr.page->mp_txnid,
+                  mc->mc_txn->mt_txnid, mc->mc_txn->mt_front);
+              return MDBX_PROBLEM;
+            }
+
+            /* It is writable only in a parent txn */
+            MDBX_page *np = mdbx_page_malloc(mc->mc_txn, ovpages);
+            if (unlikely(!np))
+              return MDBX_ENOMEM;
+
+            memcpy(np, pgr.page, PAGEHDRSZ); /* Copy header of page */
+            err = mdbx_page_dirty(mc->mc_txn, pgr.page = np, ovpages);
+            if (unlikely(err != MDBX_SUCCESS))
+              return err;
+
+#if MDBX_ENABLE_PGOP_STAT
+            safe64_inc(&mc->mc_txn->mt_env->me_lck->mti_pgop_stat.clone,
+                       ovpages);
+#endif /* MDBX_ENABLE_PGOP_STAT */
+            mdbx_cassert(mc, mdbx_dirtylist_check(mc->mc_txn));
+          }
+        }
+        node_set_ds(node, data->iov_len);
+        if (F_ISSET(flags, MDBX_RESERVE))
+          data->iov_base = page_data(pgr.page);
+        else
+          memcpy(page_data(pgr.page), data->iov_base, data->iov_len);
+
+        if (mdbx_audit_enabled()) {
+          err = mdbx_cursor_check(mc, 0);
+          if (unlikely(err != MDBX_SUCCESS))
+            return err;
+        }
+        return MDBX_SUCCESS;
+      }
+
+      if ((err = mdbx_page_retire(mc, pgr.page)) != MDBX_SUCCESS)
+        return err;
+    } else {
+      olddata.iov_len = node_ds(node);
+      olddata.iov_base = node_data(node);
+      mdbx_cassert(mc, (char *)olddata.iov_base + olddata.iov_len <=
+                           (char *)(mc->mc_pg[mc->mc_top]) + env->me_psize);
+
+      /* DB has dups? */
+      if (F_ISSET(mc->mc_db->md_flags, MDBX_DUPSORT)) {
+        /* Prepare (sub-)page/sub-DB to accept the new item, if needed.
+         * fp: old sub-page or a header faking it.
+         * mp: new (sub-)page.  offset: growth in page size.
+         * xdata: node data with new page or DB. */
+        unsigned i;
+        size_t offset = 0;
+        MDBX_page *mp = fp = xdata.iov_base = env->me_pbuf;
+        mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno;
+
+        /* Was a single item before, must convert now */
+        if (!F_ISSET(node_flags(node), F_DUPDATA)) {
+
+          /* does data match? */
+          const int cmp = mc->mc_dbx->md_dcmp(data, &olddata);
+          if ((flags & MDBX_APPENDDUP) && unlikely(cmp <= 0))
+            return MDBX_EKEYMISMATCH;
+          if (cmp == 0) {
+            if (flags & MDBX_NODUPDATA)
+              return MDBX_KEYEXIST;
+            if (likely(unsure_equal(mc->mc_dbx->md_dcmp, data, &olddata))) {
+              /* the data matches exactly byte-to-byte, nothing to update */
+              if (unlikely(flags & MDBX_MULTIPLE)) {
+                rc = MDBX_SUCCESS;
+                goto continue_multiple;
+              }
+              return MDBX_SUCCESS;
+            } else {
+              /* The data has differences, but the user-provided comparator
+               * considers them equal. So continue the update, since the put
+               * was called without MDBX_NODUPDATA.
*/ + } + mdbx_cassert(mc, node_size(key, data) <= env->me_leaf_nodemax); + goto current; + } + + /* Just overwrite the current item */ + if (flags & MDBX_CURRENT) { + mdbx_cassert(mc, node_size(key, data) <= env->me_leaf_nodemax); + goto current; + } + + /* Back up original data item */ + memcpy(dkey.iov_base = fp + 1, olddata.iov_base, + dkey.iov_len = olddata.iov_len); + dupdata_flag = 1; + + /* Make sub-page header for the dup items, with dummy body */ + fp->mp_flags = P_LEAF | P_SUBP; + fp->mp_lower = 0; + xdata.iov_len = PAGEHDRSZ + dkey.iov_len + data->iov_len; + if (mc->mc_db->md_flags & MDBX_DUPFIXED) { + fp->mp_flags |= P_LEAF2; + fp->mp_leaf2_ksize = (uint16_t)data->iov_len; + xdata.iov_len += 2 * data->iov_len; /* leave space for 2 more */ + mdbx_cassert(mc, xdata.iov_len <= env->me_psize); + } else { + xdata.iov_len += 2 * (sizeof(indx_t) + NODESIZE) + + (dkey.iov_len & 1) + (data->iov_len & 1); + mdbx_cassert(mc, xdata.iov_len <= env->me_psize); + } + fp->mp_upper = (uint16_t)(xdata.iov_len - PAGEHDRSZ); + olddata.iov_len = xdata.iov_len; /* pretend olddata is fp */ + } else if (node_flags(node) & F_SUBDATA) { + /* Data is on sub-DB, just store it */ + flags |= F_DUPDATA | F_SUBDATA; + goto put_sub; + } else { + /* Data is on sub-page */ + fp = olddata.iov_base; + switch (flags) { + default: + if (!(mc->mc_db->md_flags & MDBX_DUPFIXED)) { + offset = node_size(data, nullptr) + sizeof(indx_t); + break; + } + offset = fp->mp_leaf2_ksize; + if (page_room(fp) < offset) { + offset *= 4; /* space for 4 more */ + break; + } + /* FALLTHRU: Big enough MDBX_DUPFIXED sub-page */ + __fallthrough; + case MDBX_CURRENT | MDBX_NODUPDATA: + case MDBX_CURRENT: + fp->mp_txnid = mc->mc_txn->mt_front; + fp->mp_pgno = mp->mp_pgno; + mc->mc_xcursor->mx_cursor.mc_pg[0] = fp; + flags |= F_DUPDATA; + goto put_sub; + } + xdata.iov_len = olddata.iov_len + offset; + } + + fp_flags = fp->mp_flags; + if (node_size_len(node_ks(node), xdata.iov_len) > + env->me_leaf_nodemax) { + /* Too big for a sub-page, convert to sub-DB */ + fp_flags &= ~P_SUBP; + prep_subDB: + nested_dupdb.md_xsize = 0; + nested_dupdb.md_flags = flags_db2sub(mc->mc_db->md_flags); + if (mc->mc_db->md_flags & MDBX_DUPFIXED) { + fp_flags |= P_LEAF2; + nested_dupdb.md_xsize = fp->mp_leaf2_ksize; + } + nested_dupdb.md_depth = 1; + nested_dupdb.md_branch_pages = 0; + nested_dupdb.md_leaf_pages = 1; + nested_dupdb.md_overflow_pages = 0; + nested_dupdb.md_entries = page_numkeys(fp); + xdata.iov_len = sizeof(nested_dupdb); + xdata.iov_base = &nested_dupdb; + const struct page_result par = mdbx_page_alloc(mc, 1, MDBX_ALLOC_ALL); + mp = par.page; + if (unlikely(par.err != MDBX_SUCCESS)) + return par.err; + mc->mc_db->md_leaf_pages += 1; + mdbx_cassert(mc, env->me_psize > olddata.iov_len); + offset = env->me_psize - (unsigned)olddata.iov_len; + flags |= F_DUPDATA | F_SUBDATA; + nested_dupdb.md_root = mp->mp_pgno; + nested_dupdb.md_seq = 0; + nested_dupdb.md_mod_txnid = mc->mc_txn->mt_txnid; + sub_root = mp; + } + if (mp != fp) { + mp->mp_flags = fp_flags; + mp->mp_txnid = mc->mc_txn->mt_front; + mp->mp_leaf2_ksize = fp->mp_leaf2_ksize; + mp->mp_lower = fp->mp_lower; + mdbx_cassert(mc, fp->mp_upper + offset <= UINT16_MAX); + mp->mp_upper = (indx_t)(fp->mp_upper + offset); + if (unlikely(fp_flags & P_LEAF2)) { + memcpy(page_data(mp), page_data(fp), + page_numkeys(fp) * fp->mp_leaf2_ksize); + } else { + memcpy((char *)mp + mp->mp_upper + PAGEHDRSZ, + (char *)fp + fp->mp_upper + PAGEHDRSZ, + olddata.iov_len - fp->mp_upper - PAGEHDRSZ); + memcpy((char 
*)(&mp->mp_ptrs), (char *)(&fp->mp_ptrs), + page_numkeys(fp) * sizeof(mp->mp_ptrs[0])); + for (i = 0; i < page_numkeys(fp); i++) { + mdbx_cassert(mc, mp->mp_ptrs[i] + offset <= UINT16_MAX); + mp->mp_ptrs[i] += (indx_t)offset; + } + } + } + + rdata = &xdata; + flags |= F_DUPDATA; + do_sub = true; + if (!insert_key) + mdbx_node_del(mc, 0); + goto new_sub; + } + + /* MDBX passes F_SUBDATA in 'flags' to write a DB record */ + if (unlikely((node_flags(node) ^ flags) & F_SUBDATA)) + return MDBX_INCOMPATIBLE; + + current: + if (data->iov_len == olddata.iov_len) { + mdbx_cassert(mc, EVEN(key->iov_len) == EVEN(node_ks(node))); + /* same size, just replace it. Note that we could + * also reuse this node if the new data is smaller, + * but instead we opt to shrink the node in that case. */ + if (F_ISSET(flags, MDBX_RESERVE)) + data->iov_base = olddata.iov_base; + else if (!(mc->mc_flags & C_SUB)) + memcpy(olddata.iov_base, data->iov_base, data->iov_len); + else { + mdbx_cassert(mc, page_numkeys(mc->mc_pg[mc->mc_top]) == 1); + mdbx_cassert(mc, PAGETYPE(mc->mc_pg[mc->mc_top]) == P_LEAF); + mdbx_cassert(mc, node_ds(node) == 0); + mdbx_cassert(mc, node_flags(node) == 0); + mdbx_cassert(mc, key->iov_len < UINT16_MAX); + node_set_ks(node, key->iov_len); + memcpy(node_key(node), key->iov_base, key->iov_len); + mdbx_cassert(mc, (char *)node_key(node) + node_ds(node) < + (char *)(mc->mc_pg[mc->mc_top]) + env->me_psize); + goto fix_parent; + } + + if (mdbx_audit_enabled()) { + err = mdbx_cursor_check(mc, 0); + if (unlikely(err != MDBX_SUCCESS)) + return err; + } + return MDBX_SUCCESS; + } + } + mdbx_node_del(mc, 0); + } + + rdata = data; + +new_sub:; + unsigned nflags = flags & NODE_ADD_FLAGS; + size_t nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->iov_len + : leaf_size(env, key, rdata); + if (page_room(mc->mc_pg[mc->mc_top]) < nsize) { + if (!insert_key) + nflags |= MDBX_SPLIT_REPLACE; + rc = mdbx_page_split(mc, key, rdata, P_INVALID, nflags); + if (rc == MDBX_SUCCESS && mdbx_audit_enabled()) + rc = mdbx_cursor_check(mc, 0); + } else { + /* There is room already in this leaf page. */ + if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { + mdbx_cassert(mc, (nflags & (F_BIGDATA | F_SUBDATA | F_DUPDATA)) == 0 && + rdata->iov_len == 0); + rc = mdbx_node_add_leaf2(mc, mc->mc_ki[mc->mc_top], key); + } else + rc = mdbx_node_add_leaf(mc, mc->mc_ki[mc->mc_top], key, rdata, nflags); + if (likely(rc == 0)) { + /* Adjust other cursors pointing to mp */ + const MDBX_dbi dbi = mc->mc_dbi; + const unsigned i = mc->mc_top; + MDBX_page *const mp = mc->mc_pg[i]; + for (MDBX_cursor *m2 = mc->mc_txn->tw.cursors[dbi]; m2; + m2 = m2->mc_next) { + MDBX_cursor *m3 = + (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; + if (m3 == mc || m3->mc_snum < mc->mc_snum || m3->mc_pg[i] != mp) + continue; + if (m3->mc_ki[i] >= mc->mc_ki[i]) + m3->mc_ki[i] += insert_key; + if (XCURSOR_INITED(m3)) + XCURSOR_REFRESH(m3, mp, m3->mc_ki[i]); + } + } + } + + if (likely(rc == MDBX_SUCCESS)) { + /* Now store the actual data in the child DB. Note that we're + * storing the user data in the keys field, so there are strict + * size limits on dupdata. The actual data fields of the child + * DB are all zero size. 
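+     * E.g. for a MDBX_DUPSORT table the pair ("k", "v1") is kept in the
+     * nested tree as key "v1" with a zero-length value, which is why the
+     * size of duplicate values is bounded by the key-size limit.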
*/ + if (do_sub) { + int xflags; + size_t ecount; + put_sub: + xdata.iov_len = 0; + xdata.iov_base = nullptr; + MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); +#define SHIFT_MDBX_NODUPDATA_TO_MDBX_NOOVERWRITE 1 + STATIC_ASSERT( + (MDBX_NODUPDATA >> SHIFT_MDBX_NODUPDATA_TO_MDBX_NOOVERWRITE) == + MDBX_NOOVERWRITE); + xflags = MDBX_CURRENT | MDBX_NOSPILL | + ((flags & MDBX_NODUPDATA) >> + SHIFT_MDBX_NODUPDATA_TO_MDBX_NOOVERWRITE); + if ((flags & MDBX_CURRENT) == 0) { + xflags -= MDBX_CURRENT; + err = mdbx_xcursor_init1(mc, node, mc->mc_pg[mc->mc_top]); + if (unlikely(err != MDBX_SUCCESS)) + return err; + } + if (sub_root) + mc->mc_xcursor->mx_cursor.mc_pg[0] = sub_root; + /* converted, write the original data first */ + if (dupdata_flag) { + rc = mdbx_cursor_put(&mc->mc_xcursor->mx_cursor, &dkey, &xdata, xflags); + if (unlikely(rc)) + goto bad_sub; + /* we've done our job */ + dkey.iov_len = 0; + } + if (!(node_flags(node) & F_SUBDATA) || sub_root) { + /* Adjust other cursors pointing to mp */ + MDBX_cursor *m2; + MDBX_xcursor *mx = mc->mc_xcursor; + unsigned i = mc->mc_top; + MDBX_page *mp = mc->mc_pg[i]; + const int nkeys = page_numkeys(mp); + + for (m2 = mc->mc_txn->tw.cursors[mc->mc_dbi]; m2; m2 = m2->mc_next) { + if (m2 == mc || m2->mc_snum < mc->mc_snum) + continue; + if (!(m2->mc_flags & C_INITIALIZED)) + continue; + if (m2->mc_pg[i] == mp) { + if (m2->mc_ki[i] == mc->mc_ki[i]) { + err = mdbx_xcursor_init2(m2, mx, dupdata_flag); + if (unlikely(err != MDBX_SUCCESS)) + return err; + } else if (!insert_key && m2->mc_ki[i] < nkeys) { + XCURSOR_REFRESH(m2, mp, m2->mc_ki[i]); + } + } + } + } + mdbx_cassert(mc, mc->mc_xcursor->mx_db.md_entries < PTRDIFF_MAX); + ecount = (size_t)mc->mc_xcursor->mx_db.md_entries; +#define SHIFT_MDBX_APPENDDUP_TO_MDBX_APPEND 1 + STATIC_ASSERT((MDBX_APPENDDUP >> SHIFT_MDBX_APPENDDUP_TO_MDBX_APPEND) == + MDBX_APPEND); + xflags |= (flags & MDBX_APPENDDUP) >> SHIFT_MDBX_APPENDDUP_TO_MDBX_APPEND; + rc = mdbx_cursor_put(&mc->mc_xcursor->mx_cursor, data, &xdata, xflags); + if (flags & F_SUBDATA) { + void *db = node_data(node); + mc->mc_xcursor->mx_db.md_mod_txnid = mc->mc_txn->mt_txnid; + memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDBX_db)); + } + insert_data = (ecount != (size_t)mc->mc_xcursor->mx_db.md_entries); + } + /* Increment count unless we just replaced an existing item. */ + if (insert_data) + mc->mc_db->md_entries++; + if (insert_key) { + /* Invalidate txn if we created an empty sub-DB */ + if (unlikely(rc)) + goto bad_sub; + /* If we succeeded and the key didn't exist before, + * make sure the cursor is marked valid. 
*/ + mc->mc_flags |= C_INITIALIZED; + } + if (unlikely(flags & MDBX_MULTIPLE)) { + if (likely(rc == MDBX_SUCCESS)) { + continue_multiple: + mcount++; + /* let caller know how many succeeded, if any */ + data[1].iov_len = mcount; + if (mcount < dcount) { + data[0].iov_base = (char *)data[0].iov_base + data[0].iov_len; + insert_key = insert_data = false; + goto more; + } + } + } + if (rc == MDBX_SUCCESS && mdbx_audit_enabled()) + rc = mdbx_cursor_check(mc, 0); + return rc; + bad_sub: + if (unlikely(rc == MDBX_KEYEXIST)) { + /* should not happen, we deleted that item */ + mdbx_error("Unexpected %i error while put to nested dupsort's hive", rc); + rc = MDBX_PROBLEM; + } + } + mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; + return rc; +} + +int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { + if (unlikely(!mc)) + return MDBX_EINVAL; + + if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) + return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL + : MDBX_EBADSIGN; + + int rc = check_txn_rw(mc->mc_txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi))) + return MDBX_BAD_DBI; + + if (unlikely(!(mc->mc_flags & C_INITIALIZED))) + return MDBX_ENODATA; + + if (unlikely(mc->mc_ki[mc->mc_top] >= page_numkeys(mc->mc_pg[mc->mc_top]))) + return MDBX_NOTFOUND; + + if (likely((flags & MDBX_NOSPILL) == 0) && + unlikely(rc = mdbx_cursor_spill(mc, NULL, NULL))) + return rc; + + rc = mdbx_cursor_touch(mc); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + MDBX_page *mp = mc->mc_pg[mc->mc_top]; + if (!MDBX_DISABLE_PAGECHECKS && unlikely(!IS_LEAF(mp))) + return MDBX_CORRUPTED; + if (IS_LEAF2(mp)) + goto del_key; + + MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); + if (F_ISSET(node_flags(node), F_DUPDATA)) { + if (flags & (MDBX_ALLDUPS | /* for compatibility */ MDBX_NODUPDATA)) { + /* mdbx_cursor_del0() will subtract the final entry */ + mc->mc_db->md_entries -= mc->mc_xcursor->mx_db.md_entries - 1; + mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; + } else { + if (!F_ISSET(node_flags(node), F_SUBDATA)) + mc->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node); + rc = mdbx_cursor_del(&mc->mc_xcursor->mx_cursor, MDBX_NOSPILL); + if (unlikely(rc)) + return rc; + /* If sub-DB still has entries, we're done */ + if (mc->mc_xcursor->mx_db.md_entries) { + if (node_flags(node) & F_SUBDATA) { + /* update subDB info */ + void *db = node_data(node); + mc->mc_xcursor->mx_db.md_mod_txnid = mc->mc_txn->mt_txnid; + memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDBX_db)); + } else { + MDBX_cursor *m2; + /* shrink fake page */ + mdbx_node_shrink(mp, mc->mc_ki[mc->mc_top]); + node = page_node(mp, mc->mc_ki[mc->mc_top]); + mc->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node); + /* fix other sub-DB cursors pointed at fake pages on this page */ + for (m2 = mc->mc_txn->tw.cursors[mc->mc_dbi]; m2; m2 = m2->mc_next) { + if (m2 == mc || m2->mc_snum < mc->mc_snum) + continue; + if (!(m2->mc_flags & C_INITIALIZED)) + continue; + if (m2->mc_pg[mc->mc_top] == mp) { + MDBX_node *inner = node; + if (m2->mc_ki[mc->mc_top] >= page_numkeys(mp)) + continue; + if (m2->mc_ki[mc->mc_top] != mc->mc_ki[mc->mc_top]) { + inner = page_node(mp, m2->mc_ki[mc->mc_top]); + if (node_flags(inner) & F_SUBDATA) + continue; + } + m2->mc_xcursor->mx_cursor.mc_pg[0] = node_data(inner); + } + } + } + mc->mc_db->md_entries--; + mdbx_cassert(mc, mc->mc_db->md_entries > 0 && mc->mc_db->md_depth > 0 && + mc->mc_db->md_root != P_INVALID); + return rc; + } else { + 
mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; + } + /* otherwise fall thru and delete the sub-DB */ + } + + if (node_flags(node) & F_SUBDATA) { + /* add all the child DB's pages to the free list */ + rc = mdbx_drop_tree(&mc->mc_xcursor->mx_cursor, false); + if (unlikely(rc)) + goto fail; + } + } + /* MDBX passes F_SUBDATA in 'flags' to delete a DB record */ + else if (unlikely((node_flags(node) ^ flags) & F_SUBDATA)) + return MDBX_INCOMPATIBLE; + + /* add overflow pages to free list */ + if (F_ISSET(node_flags(node), F_BIGDATA)) { + MDBX_page *omp; + if (unlikely((rc = mdbx_page_get(mc, node_largedata_pgno(node), &omp, + pp_txnid4chk(mp, mc->mc_txn))) || + (rc = mdbx_page_retire(mc, omp)))) + goto fail; + } + +del_key: + return mdbx_cursor_del0(mc); + +fail: + mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; + return rc; +} + +/* Allocate and initialize new pages for a database. + * Set MDBX_TXN_ERROR on failure. + * + * [in] mc a cursor on the database being added to. + * [in] flags flags defining what type of page is being allocated. + * [in] num the number of pages to allocate. This is usually 1, + * unless allocating overflow pages for a large record. + * [out] mp Address of a page, or NULL on failure. + * + * Returns 0 on success, non-zero on failure. */ +static struct page_result mdbx_page_new(MDBX_cursor *mc, const unsigned flags, + const unsigned npages) { + struct page_result ret = mdbx_page_alloc(mc, npages, MDBX_ALLOC_ALL); + if (unlikely(ret.err != MDBX_SUCCESS)) + return ret; + + mdbx_debug("db %u allocated new page %" PRIaPGNO ", num %u", mc->mc_dbi, + ret.page->mp_pgno, npages); + ret.page->mp_flags = (uint16_t)flags; + ret.page->mp_txnid = mc->mc_txn->mt_front; + mdbx_cassert(mc, *mc->mc_dbistate & DBI_DIRTY); + mdbx_cassert(mc, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY); +#if MDBX_ENABLE_PGOP_STAT + safe64_inc(&mc->mc_txn->mt_env->me_lck->mti_pgop_stat.newly, npages); +#endif /* MDBX_ENABLE_PGOP_STAT */ + + if (likely((flags & P_OVERFLOW) == 0)) { + STATIC_ASSERT(P_BRANCH == 1); + const bool is_branch = flags & P_BRANCH; + ret.page->mp_lower = 0; + ret.page->mp_upper = (indx_t)(mc->mc_txn->mt_env->me_psize - PAGEHDRSZ); + mc->mc_db->md_branch_pages += is_branch; + mc->mc_db->md_leaf_pages += 1 - is_branch; + if (unlikely(mc->mc_flags & C_SUB)) { + MDBX_db *outer = mdbx_outer_db(mc); + outer->md_branch_pages += is_branch; + outer->md_leaf_pages += 1 - is_branch; + } + } else { + mc->mc_db->md_overflow_pages += npages; + ret.page->mp_pages = npages; + mdbx_cassert(mc, !(mc->mc_flags & C_SUB)); + } + + return ret; +} + +static int __must_check_result mdbx_node_add_leaf2(MDBX_cursor *mc, + unsigned indx, + const MDBX_val *key) { + MDBX_page *mp = mc->mc_pg[mc->mc_top]; + DKBUF_DEBUG; + mdbx_debug("add to leaf2-%spage %" PRIaPGNO " index %i, " + " key size %" PRIuPTR " [%s]", + IS_SUBP(mp) ? "sub-" : "", mp->mp_pgno, indx, + key ? 
key->iov_len : 0, DKEY_DEBUG(key)); + + mdbx_cassert(mc, key); + mdbx_cassert(mc, PAGETYPE(mp) == (P_LEAF | P_LEAF2)); + const unsigned ksize = mc->mc_db->md_xsize; + mdbx_cassert(mc, ksize == key->iov_len); + const unsigned nkeys = page_numkeys(mp); + + /* Just using these for counting */ + const intptr_t lower = mp->mp_lower + sizeof(indx_t); + const intptr_t upper = mp->mp_upper - (ksize - sizeof(indx_t)); + if (unlikely(lower > upper)) { + mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; + return MDBX_PAGE_FULL; + } + mp->mp_lower = (indx_t)lower; + mp->mp_upper = (indx_t)upper; + + char *const ptr = page_leaf2key(mp, indx, ksize); + mdbx_cassert(mc, nkeys >= indx); + const unsigned diff = nkeys - indx; + if (likely(diff > 0)) + /* Move higher keys up one slot. */ + memmove(ptr + ksize, ptr, diff * ksize); + /* insert new key */ + memcpy(ptr, key->iov_base, ksize); + return MDBX_SUCCESS; +} + +static int __must_check_result mdbx_node_add_branch(MDBX_cursor *mc, + unsigned indx, + const MDBX_val *key, + pgno_t pgno) { + MDBX_page *mp = mc->mc_pg[mc->mc_top]; + DKBUF_DEBUG; + mdbx_debug("add to branch-%spage %" PRIaPGNO " index %i, node-pgno %" PRIaPGNO + " key size %" PRIuPTR " [%s]", + IS_SUBP(mp) ? "sub-" : "", mp->mp_pgno, indx, pgno, + key ? key->iov_len : 0, DKEY_DEBUG(key)); + + mdbx_cassert(mc, PAGETYPE(mp) == P_BRANCH); + STATIC_ASSERT(NODESIZE % 2 == 0); + + /* Move higher pointers up one slot. */ + const unsigned nkeys = page_numkeys(mp); + mdbx_cassert(mc, nkeys >= indx); + for (unsigned i = nkeys; i > indx; --i) + mp->mp_ptrs[i] = mp->mp_ptrs[i - 1]; + + /* Adjust free space offsets. */ + const size_t branch_bytes = branch_size(mc->mc_txn->mt_env, key); + const intptr_t lower = mp->mp_lower + sizeof(indx_t); + const intptr_t upper = mp->mp_upper - (branch_bytes - sizeof(indx_t)); + if (unlikely(lower > upper)) { + mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; + return MDBX_PAGE_FULL; + } + mp->mp_lower = (indx_t)lower; + mp->mp_ptrs[indx] = mp->mp_upper = (indx_t)upper; + + /* Write the node data. */ + MDBX_node *node = page_node(mp, indx); + node_set_pgno(node, pgno); + node_set_flags(node, 0); + UNALIGNED_POKE_8(node, MDBX_node, mn_extra, 0); + node_set_ks(node, 0); + if (likely(key != NULL)) { + node_set_ks(node, key->iov_len); + memcpy(node_key(node), key->iov_base, key->iov_len); + } + return MDBX_SUCCESS; +} + +static int __must_check_result mdbx_node_add_leaf(MDBX_cursor *mc, + unsigned indx, + const MDBX_val *key, + MDBX_val *data, + unsigned flags) { + MDBX_page *mp = mc->mc_pg[mc->mc_top]; + DKBUF_DEBUG; + mdbx_debug("add to leaf-%spage %" PRIaPGNO " index %i, data size %" PRIuPTR + " key size %" PRIuPTR " [%s]", + IS_SUBP(mp) ? "sub-" : "", mp->mp_pgno, indx, + data ? data->iov_len : 0, key ? key->iov_len : 0, DKEY_DEBUG(key)); + mdbx_cassert(mc, key != NULL && data != NULL); + mdbx_cassert(mc, PAGETYPE(mp) == P_LEAF); + mdbx_cassert(mc, page_room(mp) >= leaf_size(mc->mc_txn->mt_env, key, data)); + MDBX_page *largepage = NULL; + + size_t node_bytes; + if (unlikely(flags & F_BIGDATA)) { + /* Data already on overflow page. */ + STATIC_ASSERT(sizeof(pgno_t) % 2 == 0); + node_bytes = + node_size_len(key->iov_len, 0) + sizeof(pgno_t) + sizeof(indx_t); + } else if (unlikely(node_size(key, data) > + mc->mc_txn->mt_env->me_leaf_nodemax)) { + /* Put data on overflow page. 
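+     * Only the pgno of the first overflow page is kept inside the node
+     * (written via poke_pgno() below); the data bytes themselves go to
+     * the separately allocated run of P_OVERFLOW pages.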
*/ + if (unlikely(mc->mc_db->md_flags & MDBX_DUPSORT)) { + mdbx_error("Unexpected target %s flags 0x%x for large data-item", + "dupsort-db", mc->mc_db->md_flags); + return MDBX_PROBLEM; + } + if (unlikely(flags & (F_DUPDATA | F_SUBDATA))) { + mdbx_error("Unexpected target %s flags 0x%x for large data-item", "node", + flags); + return MDBX_PROBLEM; + } + const pgno_t ovpages = number_of_ovpages(mc->mc_txn->mt_env, data->iov_len); + const struct page_result npr = mdbx_page_new(mc, P_OVERFLOW, ovpages); + if (unlikely(npr.err != MDBX_SUCCESS)) + return npr.err; + largepage = npr.page; + mdbx_debug("allocated %u overflow page(s) %" PRIaPGNO "for %" PRIuPTR + " data bytes", + largepage->mp_pages, largepage->mp_pgno, data->iov_len); + flags |= F_BIGDATA; + node_bytes = + node_size_len(key->iov_len, 0) + sizeof(pgno_t) + sizeof(indx_t); + } else { + node_bytes = node_size(key, data) + sizeof(indx_t); + } + mdbx_cassert(mc, node_bytes == leaf_size(mc->mc_txn->mt_env, key, data)); + + /* Move higher pointers up one slot. */ + const unsigned nkeys = page_numkeys(mp); + mdbx_cassert(mc, nkeys >= indx); + for (unsigned i = nkeys; i > indx; --i) + mp->mp_ptrs[i] = mp->mp_ptrs[i - 1]; + + /* Adjust free space offsets. */ + const intptr_t lower = mp->mp_lower + sizeof(indx_t); + const intptr_t upper = mp->mp_upper - (node_bytes - sizeof(indx_t)); + if (unlikely(lower > upper)) { + mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; + return MDBX_PAGE_FULL; + } + mp->mp_lower = (indx_t)lower; + mp->mp_ptrs[indx] = mp->mp_upper = (indx_t)upper; + + /* Write the node data. */ + MDBX_node *node = page_node(mp, indx); + node_set_ks(node, key->iov_len); + node_set_flags(node, (uint8_t)flags); + UNALIGNED_POKE_8(node, MDBX_node, mn_extra, 0); + node_set_ds(node, data->iov_len); + memcpy(node_key(node), key->iov_base, key->iov_len); + + void *nodedata = node_data(node); + if (likely(largepage == NULL)) { + if (unlikely(flags & F_BIGDATA)) + memcpy(nodedata, data->iov_base, sizeof(pgno_t)); + else if (unlikely(flags & MDBX_RESERVE)) + data->iov_base = nodedata; + else if (likely(nodedata != data->iov_base && + data->iov_len /* to avoid UBSAN traps*/ != 0)) + memcpy(nodedata, data->iov_base, data->iov_len); + } else { + poke_pgno(nodedata, largepage->mp_pgno); + nodedata = page_data(largepage); + if (unlikely(flags & MDBX_RESERVE)) + data->iov_base = nodedata; + else if (likely(nodedata != data->iov_base && + data->iov_len /* to avoid UBSAN traps*/ != 0)) + memcpy(nodedata, data->iov_base, data->iov_len); + } + return MDBX_SUCCESS; +} + +/* Delete the specified node from a page. + * [in] mc Cursor pointing to the node to delete. + * [in] ksize The size of a node. Only used if the page is + * part of a MDBX_DUPFIXED database. */ +static void mdbx_node_del(MDBX_cursor *mc, size_t ksize) { + MDBX_page *mp = mc->mc_pg[mc->mc_top]; + int indx = mc->mc_ki[mc->mc_top]; + int i, j, nkeys, ptr; + MDBX_node *node; + char *base; + + mdbx_debug("delete node %u on %s page %" PRIaPGNO, indx, + IS_LEAF(mp) ? 
"leaf" : "branch", mp->mp_pgno); + nkeys = page_numkeys(mp); + mdbx_cassert(mc, indx < nkeys); + + if (IS_LEAF2(mp)) { + mdbx_cassert(mc, ksize >= sizeof(indx_t)); + unsigned diff = nkeys - 1 - indx; + base = page_leaf2key(mp, indx, ksize); + if (diff) + memmove(base, base + ksize, diff * ksize); + mdbx_cassert(mc, mp->mp_lower >= sizeof(indx_t)); + mp->mp_lower -= sizeof(indx_t); + mdbx_cassert(mc, + (size_t)UINT16_MAX - mp->mp_upper >= ksize - sizeof(indx_t)); + mp->mp_upper += (indx_t)(ksize - sizeof(indx_t)); + return; + } + + node = page_node(mp, indx); + mdbx_cassert(mc, !IS_BRANCH(mp) || indx || node_ks(node) == 0); + size_t sz = NODESIZE + node_ks(node); + if (IS_LEAF(mp)) { + if (F_ISSET(node_flags(node), F_BIGDATA)) + sz += sizeof(pgno_t); + else + sz += node_ds(node); + } + sz = EVEN(sz); + + ptr = mp->mp_ptrs[indx]; + for (i = j = 0; i < nkeys; i++) { + if (i != indx) { + mp->mp_ptrs[j] = mp->mp_ptrs[i]; + if (mp->mp_ptrs[i] < ptr) { + mdbx_cassert(mc, (size_t)UINT16_MAX - mp->mp_ptrs[j] >= sz); + mp->mp_ptrs[j] += (indx_t)sz; + } + j++; + } + } + + base = (char *)mp + mp->mp_upper + PAGEHDRSZ; + memmove(base + sz, base, ptr - mp->mp_upper); + + mdbx_cassert(mc, mp->mp_lower >= sizeof(indx_t)); + mp->mp_lower -= sizeof(indx_t); + mdbx_cassert(mc, (size_t)UINT16_MAX - mp->mp_upper >= sz); + mp->mp_upper += (indx_t)sz; + +#if MDBX_DEBUG > 0 + if (mdbx_audit_enabled()) { + int page_check_err = mdbx_page_check(mc, mp, C_UPDATING); + mdbx_cassert(mc, page_check_err == MDBX_SUCCESS); + } +#endif +} + +/* Compact the main page after deleting a node on a subpage. + * [in] mp The main page to operate on. + * [in] indx The index of the subpage on the main page. */ +static void mdbx_node_shrink(MDBX_page *mp, unsigned indx) { + MDBX_node *node; + MDBX_page *sp, *xp; + char *base; + size_t nsize, delta, len, ptr; + int i; + + node = page_node(mp, indx); + sp = (MDBX_page *)node_data(node); + delta = page_room(sp); + assert(delta > 0); + + /* Prepare to shift upward, set len = length(subpage part to shift) */ + if (IS_LEAF2(sp)) { + delta &= /* do not make the node uneven-sized */ ~(size_t)1; + if (unlikely(delta) == 0) + return; + nsize = node_ds(node) - delta; + assert(nsize % 1 == 0); + len = nsize; + } else { + xp = (MDBX_page *)((char *)sp + delta); /* destination subpage */ + for (i = page_numkeys(sp); --i >= 0;) { + assert(sp->mp_ptrs[i] >= delta); + xp->mp_ptrs[i] = (indx_t)(sp->mp_ptrs[i] - delta); + } + nsize = node_ds(node) - delta; + len = PAGEHDRSZ; + } + sp->mp_upper = sp->mp_lower; + sp->mp_pgno = mp->mp_pgno; + node_set_ds(node, nsize); + + /* Shift upward */ + base = (char *)mp + mp->mp_upper + PAGEHDRSZ; + memmove(base + delta, base, (char *)sp + len - base); + + ptr = mp->mp_ptrs[indx]; + for (i = page_numkeys(mp); --i >= 0;) { + if (mp->mp_ptrs[i] <= ptr) { + assert((size_t)UINT16_MAX - mp->mp_ptrs[i] >= delta); + mp->mp_ptrs[i] += (indx_t)delta; + } + } + assert((size_t)UINT16_MAX - mp->mp_upper >= delta); + mp->mp_upper += (indx_t)delta; +} + +/* Initial setup of a sorted-dups cursor. + * + * Sorted duplicates are implemented as a sub-database for the given key. + * The duplicate data items are actually keys of the sub-database. + * Operations on the duplicate data items are performed using a sub-cursor + * initialized when the sub-database is first accessed. This function does + * the preliminary setup of the sub-cursor, filling in the fields that + * depend only on the parent DB. + * + * [in] mc The main cursor whose sorted-dups cursor is to be initialized. 
*/ +static int mdbx_xcursor_init0(MDBX_cursor *mc) { + MDBX_xcursor *mx = mc->mc_xcursor; + if (!MDBX_DISABLE_PAGECHECKS && unlikely(mx == nullptr)) { + mdbx_error("unexpected dupsort-page for non-dupsort db/cursor (dbi %u)", + mc->mc_dbi); + return MDBX_CORRUPTED; + } + + mx->mx_cursor.mc_xcursor = NULL; + mx->mx_cursor.mc_next = NULL; + mx->mx_cursor.mc_txn = mc->mc_txn; + mx->mx_cursor.mc_db = &mx->mx_db; + mx->mx_cursor.mc_dbx = &mx->mx_dbx; + mx->mx_cursor.mc_dbi = mc->mc_dbi; + mx->mx_cursor.mc_dbistate = mc->mc_dbistate; + mx->mx_cursor.mc_snum = 0; + mx->mx_cursor.mc_top = 0; + mx->mx_cursor.mc_flags = C_SUB | (mc->mc_flags & (C_COPYING | C_SKIPORD)); + mx->mx_dbx.md_name.iov_len = 0; + mx->mx_dbx.md_name.iov_base = NULL; + mx->mx_dbx.md_cmp = mc->mc_dbx->md_dcmp; + mx->mx_dbx.md_dcmp = NULL; + mx->mx_dbx.md_klen_min = INT_MAX; + mx->mx_dbx.md_vlen_min = mx->mx_dbx.md_klen_max = mx->mx_dbx.md_vlen_max = 0; + return MDBX_SUCCESS; +} + +/* Final setup of a sorted-dups cursor. + * Sets up the fields that depend on the data from the main cursor. + * [in] mc The main cursor whose sorted-dups cursor is to be initialized. + * [in] node The data containing the MDBX_db record for the sorted-dup database. + */ +static int mdbx_xcursor_init1(MDBX_cursor *mc, MDBX_node *node, + const MDBX_page *mp) { + MDBX_xcursor *mx = mc->mc_xcursor; + if (!MDBX_DISABLE_PAGECHECKS && unlikely(mx == nullptr)) { + mdbx_error("unexpected dupsort-page for non-dupsort db/cursor (dbi %u)", + mc->mc_dbi); + return MDBX_CORRUPTED; + } + + const uint8_t flags = node_flags(node); + switch (flags) { + default: + mdbx_error("invalid node flags %u", flags); + return MDBX_CORRUPTED; + case F_DUPDATA | F_SUBDATA: + if (!MDBX_DISABLE_PAGECHECKS && + unlikely(node_ds(node) != sizeof(MDBX_db))) { + mdbx_error("invalid nested-db record size %zu", node_ds(node)); + return MDBX_CORRUPTED; + } + memcpy(&mx->mx_db, node_data(node), sizeof(MDBX_db)); + const txnid_t pp_txnid = mp->mp_txnid; + if (!MDBX_DISABLE_PAGECHECKS && + unlikely(mx->mx_db.md_mod_txnid > pp_txnid)) { + mdbx_error("nested-db.md_mod_txnid (%" PRIaTXN ") > page-txnid (%" PRIaTXN + ")", + mx->mx_db.md_mod_txnid, pp_txnid); + return MDBX_CORRUPTED; + } + mx->mx_cursor.mc_pg[0] = 0; + mx->mx_cursor.mc_snum = 0; + mx->mx_cursor.mc_top = 0; + mx->mx_cursor.mc_flags = C_SUB | (mc->mc_flags & (C_COPYING | C_SKIPORD)); + break; + case F_DUPDATA: + if (!MDBX_DISABLE_PAGECHECKS && unlikely(node_ds(node) <= PAGEHDRSZ)) { + mdbx_error("invalid nested-page size %zu", node_ds(node)); + return MDBX_CORRUPTED; + } + MDBX_page *fp = node_data(node); + mx->mx_db.md_depth = 1; + mx->mx_db.md_branch_pages = 0; + mx->mx_db.md_leaf_pages = 1; + mx->mx_db.md_overflow_pages = 0; + mx->mx_db.md_entries = page_numkeys(fp); + mx->mx_db.md_root = fp->mp_pgno; + mx->mx_db.md_mod_txnid = mp->mp_txnid; + mx->mx_cursor.mc_snum = 1; + mx->mx_cursor.mc_top = 0; + mx->mx_cursor.mc_flags = + C_INITIALIZED | C_SUB | (mc->mc_flags & (C_COPYING | C_SKIPORD)); + mx->mx_cursor.mc_pg[0] = fp; + mx->mx_cursor.mc_ki[0] = 0; + mx->mx_db.md_flags = flags_db2sub(mc->mc_db->md_flags); + mx->mx_db.md_xsize = + (mc->mc_db->md_flags & MDBX_DUPFIXED) ? 
fp->mp_leaf2_ksize : 0; + break; + } + + if (unlikely(mx->mx_db.md_xsize != mc->mc_db->md_xsize)) { + if (!MDBX_DISABLE_PAGECHECKS && unlikely(mc->mc_db->md_xsize != 0)) { + mdbx_error("cursor mismatched nested-db md_xsize %u", + mc->mc_db->md_xsize); + return MDBX_CORRUPTED; + } + if (!MDBX_DISABLE_PAGECHECKS && + unlikely((mc->mc_db->md_flags & MDBX_DUPFIXED) == 0)) { + mdbx_error("mismatched nested-db md_flags %u", mc->mc_db->md_flags); + return MDBX_CORRUPTED; + } + if (!MDBX_DISABLE_PAGECHECKS && + unlikely(mx->mx_db.md_xsize < mc->mc_dbx->md_vlen_min || + mx->mx_db.md_xsize > mc->mc_dbx->md_vlen_max)) { + mdbx_error("mismatched nested-db.md_xsize (%u) <> min/max value-length " + "(%zu/%zu)", + mx->mx_db.md_xsize, mc->mc_dbx->md_vlen_min, + mc->mc_dbx->md_vlen_max); + return MDBX_CORRUPTED; + } + mc->mc_db->md_xsize = mx->mx_db.md_xsize; + mc->mc_dbx->md_vlen_min = mc->mc_dbx->md_vlen_max = mx->mx_db.md_xsize; + } + mx->mx_dbx.md_klen_min = mc->mc_dbx->md_vlen_min; + mx->mx_dbx.md_klen_max = mc->mc_dbx->md_vlen_max; + + mdbx_debug("Sub-db -%u root page %" PRIaPGNO, mx->mx_cursor.mc_dbi, + mx->mx_db.md_root); + return MDBX_SUCCESS; +} + +/* Fixup a sorted-dups cursor due to underlying update. + * Sets up some fields that depend on the data from the main cursor. + * Almost the same as init1, but skips initialization steps if the + * xcursor had already been used. + * [in] mc The main cursor whose sorted-dups cursor is to be fixed up. + * [in] src_mx The xcursor of an up-to-date cursor. + * [in] new_dupdata True if converting from a non-F_DUPDATA item. */ +static int mdbx_xcursor_init2(MDBX_cursor *mc, MDBX_xcursor *src_mx, + bool new_dupdata) { + MDBX_xcursor *mx = mc->mc_xcursor; + if (!MDBX_DISABLE_PAGECHECKS && unlikely(mx == nullptr)) { + mdbx_error("unexpected dupsort-page for non-dupsort db/cursor (dbi %u)", + mc->mc_dbi); + return MDBX_CORRUPTED; + } + + if (new_dupdata) { + mx->mx_cursor.mc_snum = 1; + mx->mx_cursor.mc_top = 0; + mx->mx_cursor.mc_flags |= C_INITIALIZED; + mx->mx_cursor.mc_ki[0] = 0; + } + + mx->mx_dbx.md_klen_min = src_mx->mx_dbx.md_klen_min; + mx->mx_dbx.md_klen_max = src_mx->mx_dbx.md_klen_max; + mx->mx_dbx.md_cmp = src_mx->mx_dbx.md_cmp; + mx->mx_db = src_mx->mx_db; + mx->mx_cursor.mc_pg[0] = src_mx->mx_cursor.mc_pg[0]; + if (mx->mx_cursor.mc_flags & C_INITIALIZED) { + mdbx_debug("Sub-db -%u root page %" PRIaPGNO, mx->mx_cursor.mc_dbi, + mx->mx_db.md_root); + } + return MDBX_SUCCESS; +} + +static __inline int mdbx_couple_init(MDBX_cursor_couple *couple, + const MDBX_dbi dbi, MDBX_txn *const txn, + MDBX_db *const db, MDBX_dbx *const dbx, + uint8_t *const dbstate) { + couple->outer.mc_signature = MDBX_MC_LIVE; + couple->outer.mc_next = NULL; + couple->outer.mc_backup = NULL; + couple->outer.mc_dbi = dbi; + couple->outer.mc_txn = txn; + couple->outer.mc_db = db; + couple->outer.mc_dbx = dbx; + couple->outer.mc_dbistate = dbstate; + couple->outer.mc_snum = 0; + couple->outer.mc_top = 0; + couple->outer.mc_pg[0] = 0; + couple->outer.mc_flags = 0; + couple->outer.mc_ki[0] = 0; + couple->outer.mc_xcursor = NULL; + + int rc = MDBX_SUCCESS; + if (unlikely(*couple->outer.mc_dbistate & DBI_STALE)) { + rc = mdbx_page_search(&couple->outer, NULL, MDBX_PS_ROOTONLY); + rc = (rc != MDBX_NOTFOUND) ? 
rc : MDBX_SUCCESS; + } else if (unlikely(couple->outer.mc_dbx->md_klen_max == 0)) { + rc = mdbx_setup_dbx(couple->outer.mc_dbx, couple->outer.mc_db, + txn->mt_env->me_psize); + } + + if (couple->outer.mc_db->md_flags & MDBX_DUPSORT) { + couple->inner.mx_cursor.mc_signature = MDBX_MC_LIVE; + couple->outer.mc_xcursor = &couple->inner; + rc = mdbx_xcursor_init0(&couple->outer); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + couple->inner.mx_dbx.md_klen_min = couple->outer.mc_dbx->md_vlen_min; + couple->inner.mx_dbx.md_klen_max = couple->outer.mc_dbx->md_vlen_max; + } + return rc; +} + +/* Initialize a cursor for a given transaction and database. */ +static int mdbx_cursor_init(MDBX_cursor *mc, MDBX_txn *txn, MDBX_dbi dbi) { + STATIC_ASSERT(offsetof(MDBX_cursor_couple, outer) == 0); + if (unlikely(TXN_DBI_CHANGED(txn, dbi))) + return MDBX_BAD_DBI; + + return mdbx_couple_init(container_of(mc, MDBX_cursor_couple, outer), dbi, txn, + &txn->mt_dbs[dbi], &txn->mt_dbxs[dbi], + &txn->mt_dbistate[dbi]); +} + +MDBX_cursor *mdbx_cursor_create(void *context) { + MDBX_cursor_couple *couple = mdbx_calloc(1, sizeof(MDBX_cursor_couple)); + if (unlikely(!couple)) + return nullptr; + + couple->outer.mc_signature = MDBX_MC_READY4CLOSE; + couple->outer.mc_dbi = UINT_MAX; + couple->mc_userctx = context; + return &couple->outer; +} + +int mdbx_cursor_set_userctx(MDBX_cursor *mc, void *ctx) { + if (unlikely(!mc)) + return MDBX_EINVAL; + + if (unlikely(mc->mc_signature != MDBX_MC_READY4CLOSE && + mc->mc_signature != MDBX_MC_LIVE)) + return MDBX_EBADSIGN; + + MDBX_cursor_couple *couple = container_of(mc, MDBX_cursor_couple, outer); + couple->mc_userctx = ctx; + return MDBX_SUCCESS; +} + +void *mdbx_cursor_get_userctx(const MDBX_cursor *mc) { + if (unlikely(!mc)) + return nullptr; + + if (unlikely(mc->mc_signature != MDBX_MC_READY4CLOSE && + mc->mc_signature != MDBX_MC_LIVE)) + return nullptr; + + MDBX_cursor_couple *couple = container_of(mc, MDBX_cursor_couple, outer); + return couple->mc_userctx; +} + +int mdbx_cursor_bind(MDBX_txn *txn, MDBX_cursor *mc, MDBX_dbi dbi) { + if (unlikely(!mc)) + return MDBX_EINVAL; + + if (unlikely(mc->mc_signature != MDBX_MC_READY4CLOSE && + mc->mc_signature != MDBX_MC_LIVE)) + return MDBX_EBADSIGN; + + int rc = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(!check_dbi(txn, dbi, DBI_VALID))) + return MDBX_BAD_DBI; + + if (unlikely(dbi == FREE_DBI && !F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY))) + return MDBX_EACCESS; + + if (unlikely(mc->mc_backup)) /* Cursor from parent transaction */ { + mdbx_cassert(mc, mc->mc_signature == MDBX_MC_LIVE); + if (unlikely(mc->mc_dbi != dbi || + /* paranoia */ mc->mc_signature != MDBX_MC_LIVE || + mc->mc_txn != txn)) + return MDBX_EINVAL; + + assert(mc->mc_db == &txn->mt_dbs[dbi]); + assert(mc->mc_dbx == &txn->mt_dbxs[dbi]); + assert(mc->mc_dbi == dbi); + assert(mc->mc_dbistate == &txn->mt_dbistate[dbi]); + return likely(mc->mc_dbi == dbi && + /* paranoia */ mc->mc_signature == MDBX_MC_LIVE && + mc->mc_txn == txn) + ? MDBX_SUCCESS + : MDBX_EINVAL /* Disallow change DBI in nested transactions */; + } + + if (mc->mc_signature == MDBX_MC_LIVE) { + if (unlikely(!mc->mc_txn || + mc->mc_txn->mt_signature != MDBX_MT_SIGNATURE)) { + mdbx_error("Wrong cursor's transaction %p 0x%x", + __Wpedantic_format_voidptr(mc->mc_txn), + mc->mc_txn ? 
mc->mc_txn->mt_signature : 0); + return MDBX_PROBLEM; + } + if (mc->mc_flags & C_UNTRACK) { + mdbx_cassert(mc, !(mc->mc_txn->mt_flags & MDBX_TXN_RDONLY)); + MDBX_cursor **prev = &mc->mc_txn->tw.cursors[mc->mc_dbi]; + while (*prev && *prev != mc) + prev = &(*prev)->mc_next; + mdbx_cassert(mc, *prev == mc); + *prev = mc->mc_next; + } + mc->mc_signature = MDBX_MC_READY4CLOSE; + mc->mc_flags = 0; + mc->mc_dbi = UINT_MAX; + mc->mc_next = NULL; + mc->mc_db = NULL; + mc->mc_dbx = NULL; + mc->mc_dbistate = NULL; + } + mdbx_cassert(mc, !(mc->mc_flags & C_UNTRACK)); + + rc = mdbx_cursor_init(mc, txn, dbi); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (!(txn->mt_flags & MDBX_TXN_RDONLY)) { + mc->mc_next = txn->tw.cursors[dbi]; + txn->tw.cursors[dbi] = mc; + mc->mc_flags |= C_UNTRACK; + } + + return MDBX_SUCCESS; +} + +int mdbx_cursor_open(MDBX_txn *txn, MDBX_dbi dbi, MDBX_cursor **ret) { + if (unlikely(!ret)) + return MDBX_EINVAL; + *ret = NULL; + + MDBX_cursor *const mc = mdbx_cursor_create(nullptr); + if (unlikely(!mc)) + return MDBX_ENOMEM; + + int rc = mdbx_cursor_bind(txn, mc, dbi); + if (unlikely(rc != MDBX_SUCCESS)) { + mdbx_cursor_close(mc); + return rc; + } + + *ret = mc; + return MDBX_SUCCESS; +} + +int mdbx_cursor_renew(MDBX_txn *txn, MDBX_cursor *mc) { + return likely(mc) ? mdbx_cursor_bind(txn, mc, mc->mc_dbi) : MDBX_EINVAL; +} + +int mdbx_cursor_copy(const MDBX_cursor *src, MDBX_cursor *dest) { + if (unlikely(!src)) + return MDBX_EINVAL; + if (unlikely(src->mc_signature != MDBX_MC_LIVE)) + return (src->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL + : MDBX_EBADSIGN; + + int rc = mdbx_cursor_bind(src->mc_txn, dest, src->mc_dbi); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + assert(dest->mc_db == src->mc_db); + assert(dest->mc_dbi == src->mc_dbi); + assert(dest->mc_dbx == src->mc_dbx); + assert(dest->mc_dbistate == src->mc_dbistate); +again: + assert(dest->mc_txn == src->mc_txn); + dest->mc_flags ^= (dest->mc_flags ^ src->mc_flags) & ~C_UNTRACK; + dest->mc_top = src->mc_top; + dest->mc_snum = src->mc_snum; + for (unsigned i = 0; i < src->mc_snum; ++i) { + dest->mc_ki[i] = src->mc_ki[i]; + dest->mc_pg[i] = src->mc_pg[i]; + } + + if (src->mc_xcursor) { + dest->mc_xcursor->mx_db = src->mc_xcursor->mx_db; + dest->mc_xcursor->mx_dbx = src->mc_xcursor->mx_dbx; + src = &src->mc_xcursor->mx_cursor; + dest = &dest->mc_xcursor->mx_cursor; + goto again; + } + + return MDBX_SUCCESS; +} + +void mdbx_cursor_close(MDBX_cursor *mc) { + if (likely(mc)) { + mdbx_ensure(NULL, mc->mc_signature == MDBX_MC_LIVE || + mc->mc_signature == MDBX_MC_READY4CLOSE); + MDBX_txn *const txn = mc->mc_txn; + if (!mc->mc_backup) { + mc->mc_txn = NULL; + /* Remove from txn, if tracked. + * A read-only txn (!C_UNTRACK) may have been freed already, + * so do not peek inside it. Only write txns track cursors. 
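+       * (This is also why the cursor list below lives in txn->tw, i.e.
+       * the write-transaction-only part of the transaction object.)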
*/ + if (mc->mc_flags & C_UNTRACK) { + mdbx_ensure(txn->mt_env, check_txn_rw(txn, 0) == MDBX_SUCCESS); + MDBX_cursor **prev = &txn->tw.cursors[mc->mc_dbi]; + while (*prev && *prev != mc) + prev = &(*prev)->mc_next; + mdbx_tassert(txn, *prev == mc); + *prev = mc->mc_next; + } + mc->mc_signature = 0; + mc->mc_next = mc; + mdbx_free(mc); + } else { + /* Cursor closed before nested txn ends */ + mdbx_tassert(txn, mc->mc_signature == MDBX_MC_LIVE); + mdbx_ensure(txn->mt_env, check_txn_rw(txn, 0) == MDBX_SUCCESS); + mc->mc_signature = MDBX_MC_WAIT4EOT; + } + } +} + +MDBX_txn *mdbx_cursor_txn(const MDBX_cursor *mc) { + if (unlikely(!mc || mc->mc_signature != MDBX_MC_LIVE)) + return NULL; + MDBX_txn *txn = mc->mc_txn; + if (unlikely(!txn || txn->mt_signature != MDBX_MT_SIGNATURE)) + return NULL; + if (unlikely(txn->mt_flags & MDBX_TXN_FINISHED)) + return NULL; + return txn; +} + +MDBX_dbi mdbx_cursor_dbi(const MDBX_cursor *mc) { + if (unlikely(!mc || mc->mc_signature != MDBX_MC_LIVE)) + return UINT_MAX; + return mc->mc_dbi; +} + +/* Return the count of duplicate data items for the current key */ +int mdbx_cursor_count(const MDBX_cursor *mc, size_t *countp) { + if (unlikely(mc == NULL)) + return MDBX_EINVAL; + + if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) + return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL + : MDBX_EBADSIGN; + + int rc = check_txn(mc->mc_txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(countp == NULL || !(mc->mc_flags & C_INITIALIZED))) + return MDBX_EINVAL; + + if (!mc->mc_snum) { + *countp = 0; + return MDBX_NOTFOUND; + } + + MDBX_page *mp = mc->mc_pg[mc->mc_top]; + if ((mc->mc_flags & C_EOF) && mc->mc_ki[mc->mc_top] >= page_numkeys(mp)) { + *countp = 0; + return MDBX_NOTFOUND; + } + + *countp = 1; + if (mc->mc_xcursor != NULL) { + MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); + if (F_ISSET(node_flags(node), F_DUPDATA)) { + mdbx_cassert(mc, mc->mc_xcursor && (mc->mc_xcursor->mx_cursor.mc_flags & + C_INITIALIZED)); + *countp = unlikely(mc->mc_xcursor->mx_db.md_entries > PTRDIFF_MAX) + ? PTRDIFF_MAX + : (size_t)mc->mc_xcursor->mx_db.md_entries; + } + } + return MDBX_SUCCESS; +} + +/* Replace the key for a branch node with a new key. + * Set MDBX_TXN_ERROR on failure. + * [in] mc Cursor pointing to the node to operate on. + * [in] key The new key to use. + * Returns 0 on success, non-zero on failure. */ +static int mdbx_update_key(MDBX_cursor *mc, const MDBX_val *key) { + MDBX_page *mp; + MDBX_node *node; + char *base; + size_t len; + int delta, ksize, oksize; + int ptr, i, nkeys, indx; + DKBUF_DEBUG; + + mdbx_cassert(mc, cursor_is_tracked(mc)); + indx = mc->mc_ki[mc->mc_top]; + mp = mc->mc_pg[mc->mc_top]; + node = page_node(mp, indx); + ptr = mp->mp_ptrs[indx]; +#if MDBX_DEBUG + MDBX_val k2; + k2.iov_base = node_key(node); + k2.iov_len = node_ks(node); + mdbx_debug("update key %u (offset %u) [%s] to [%s] on page %" PRIaPGNO, indx, + ptr, DVAL_DEBUG(&k2), DKEY_DEBUG(key), mp->mp_pgno); +#endif /* MDBX_DEBUG */ + + /* Sizes must be 2-byte aligned. */ + ksize = EVEN(key->iov_len); + oksize = EVEN(node_ks(node)); + delta = ksize - oksize; + + /* Shift node contents if EVEN(key length) changed. 
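+   * E.g. growing a 5-byte key to 9 bytes gives ksize = EVEN(9) = 10 and
+   * oksize = EVEN(5) = 6, so delta = 4 and the memmove() below slides the
+   * region between mp_upper and this node 4 bytes toward the page start.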
*/ + if (delta) { + if (delta > (int)page_room(mp)) { + /* not enough space left, do a delete and split */ + mdbx_debug("Not enough room, delta = %d, splitting...", delta); + pgno_t pgno = node_pgno(node); + mdbx_node_del(mc, 0); + int rc = mdbx_page_split(mc, key, NULL, pgno, MDBX_SPLIT_REPLACE); + if (rc == MDBX_SUCCESS && mdbx_audit_enabled()) + rc = mdbx_cursor_check(mc, C_UPDATING); + return rc; + } + + nkeys = page_numkeys(mp); + for (i = 0; i < nkeys; i++) { + if (mp->mp_ptrs[i] <= ptr) { + mdbx_cassert(mc, mp->mp_ptrs[i] >= delta); + mp->mp_ptrs[i] -= (indx_t)delta; + } + } + + base = (char *)mp + mp->mp_upper + PAGEHDRSZ; + len = ptr - mp->mp_upper + NODESIZE; + memmove(base - delta, base, len); + mdbx_cassert(mc, mp->mp_upper >= delta); + mp->mp_upper -= (indx_t)delta; + + node = page_node(mp, indx); + } + + /* But even if no shift was needed, update ksize */ + node_set_ks(node, key->iov_len); + + if (likely(key->iov_len /* to avoid UBSAN traps*/ != 0)) + memcpy(node_key(node), key->iov_base, key->iov_len); + return MDBX_SUCCESS; +} + +/* Move a node from csrc to cdst. */ +static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { + int rc; + DKBUF_DEBUG; + + MDBX_page *psrc = csrc->mc_pg[csrc->mc_top]; + MDBX_page *pdst = cdst->mc_pg[cdst->mc_top]; + mdbx_cassert(csrc, PAGETYPE(psrc) == PAGETYPE(pdst)); + mdbx_cassert(csrc, csrc->mc_dbi == cdst->mc_dbi); + mdbx_cassert(csrc, csrc->mc_top == cdst->mc_top); + if (unlikely(PAGETYPE(psrc) != PAGETYPE(pdst))) { + bailout: + mdbx_error("Wrong or mismatch pages's types (src %d, dst %d) to move node", + PAGETYPE(psrc), PAGETYPE(pdst)); + csrc->mc_txn->mt_flags |= MDBX_TXN_ERROR; + return MDBX_PROBLEM; + } + + MDBX_val key4move; + switch (PAGETYPE(psrc)) { + case P_BRANCH: { + const MDBX_node *srcnode = page_node(psrc, csrc->mc_ki[csrc->mc_top]); + mdbx_cassert(csrc, node_flags(srcnode) == 0); + const pgno_t srcpg = node_pgno(srcnode); + key4move.iov_len = node_ks(srcnode); + key4move.iov_base = node_key(srcnode); + + if (csrc->mc_ki[csrc->mc_top] == 0) { + const unsigned snum = csrc->mc_snum; + mdbx_cassert(csrc, snum > 0); + /* must find the lowest key below src */ + rc = mdbx_page_search_lowest(csrc); + MDBX_page *lowest_page = csrc->mc_pg[csrc->mc_top]; + if (unlikely(rc)) + return rc; + mdbx_cassert(csrc, IS_LEAF(lowest_page)); + if (unlikely(!IS_LEAF(lowest_page))) + goto bailout; + if (IS_LEAF2(lowest_page)) { + key4move.iov_len = csrc->mc_db->md_xsize; + key4move.iov_base = page_leaf2key(lowest_page, 0, key4move.iov_len); + } else { + const MDBX_node *lowest_node = page_node(lowest_page, 0); + key4move.iov_len = node_ks(lowest_node); + key4move.iov_base = node_key(lowest_node); + } + + /* restore cursor after mdbx_page_search_lowest() */ + csrc->mc_snum = snum; + csrc->mc_top = snum - 1; + csrc->mc_ki[csrc->mc_top] = 0; + + /* paranoia */ + mdbx_cassert(csrc, psrc == csrc->mc_pg[csrc->mc_top]); + mdbx_cassert(csrc, IS_BRANCH(psrc)); + if (unlikely(!IS_BRANCH(psrc))) + goto bailout; + } + + if (cdst->mc_ki[cdst->mc_top] == 0) { + const unsigned snum = cdst->mc_snum; + mdbx_cassert(csrc, snum > 0); + MDBX_cursor mn; + cursor_copy(cdst, &mn); + /* must find the lowest key below dst */ + rc = mdbx_page_search_lowest(&mn); + if (unlikely(rc)) + return rc; + MDBX_page *const lowest_page = mn.mc_pg[mn.mc_top]; + mdbx_cassert(cdst, IS_LEAF(lowest_page)); + if (unlikely(!IS_LEAF(lowest_page))) + goto bailout; + MDBX_val key; + if (IS_LEAF2(lowest_page)) { + key.iov_len = mn.mc_db->md_xsize; + key.iov_base = 
page_leaf2key(lowest_page, 0, key.iov_len); + } else { + MDBX_node *lowest_node = page_node(lowest_page, 0); + key.iov_len = node_ks(lowest_node); + key.iov_base = node_key(lowest_node); + } + + /* restore cursor after mdbx_page_search_lowest() */ + mn.mc_snum = snum; + mn.mc_top = snum - 1; + mn.mc_ki[mn.mc_top] = 0; + + const intptr_t delta = + EVEN(key.iov_len) - EVEN(node_ks(page_node(mn.mc_pg[mn.mc_top], 0))); + const intptr_t needed = + branch_size(cdst->mc_txn->mt_env, &key4move) + delta; + const intptr_t have = page_room(pdst); + if (unlikely(needed > have)) + return MDBX_RESULT_TRUE; + + if (unlikely((rc = mdbx_page_touch(csrc)) || + (rc = mdbx_page_touch(cdst)))) + return rc; + psrc = csrc->mc_pg[csrc->mc_top]; + pdst = cdst->mc_pg[cdst->mc_top]; + + WITH_CURSOR_TRACKING(mn, rc = mdbx_update_key(&mn, &key)); + if (unlikely(rc)) + return rc; + } else { + const size_t needed = branch_size(cdst->mc_txn->mt_env, &key4move); + const size_t have = page_room(pdst); + if (unlikely(needed > have)) + return MDBX_RESULT_TRUE; + + if (unlikely((rc = mdbx_page_touch(csrc)) || + (rc = mdbx_page_touch(cdst)))) + return rc; + psrc = csrc->mc_pg[csrc->mc_top]; + pdst = cdst->mc_pg[cdst->mc_top]; + } + + mdbx_debug("moving %s-node %u [%s] on page %" PRIaPGNO + " to node %u on page %" PRIaPGNO, + "branch", csrc->mc_ki[csrc->mc_top], DKEY_DEBUG(&key4move), + psrc->mp_pgno, cdst->mc_ki[cdst->mc_top], pdst->mp_pgno); + /* Add the node to the destination page. */ + rc = + mdbx_node_add_branch(cdst, cdst->mc_ki[cdst->mc_top], &key4move, srcpg); + } break; + + case P_LEAF: { + /* Mark src and dst as dirty. */ + if (unlikely((rc = mdbx_page_touch(csrc)) || (rc = mdbx_page_touch(cdst)))) + return rc; + psrc = csrc->mc_pg[csrc->mc_top]; + pdst = cdst->mc_pg[cdst->mc_top]; + const MDBX_node *srcnode = page_node(psrc, csrc->mc_ki[csrc->mc_top]); + MDBX_val data; + data.iov_len = node_ds(srcnode); + data.iov_base = node_data(srcnode); + key4move.iov_len = node_ks(srcnode); + key4move.iov_base = node_key(srcnode); + mdbx_debug("moving %s-node %u [%s] on page %" PRIaPGNO + " to node %u on page %" PRIaPGNO, + "leaf", csrc->mc_ki[csrc->mc_top], DKEY_DEBUG(&key4move), + psrc->mp_pgno, cdst->mc_ki[cdst->mc_top], pdst->mp_pgno); + /* Add the node to the destination page. */ + rc = mdbx_node_add_leaf(cdst, cdst->mc_ki[cdst->mc_top], &key4move, &data, + node_flags(srcnode)); + } break; + + case P_LEAF | P_LEAF2: { + /* Mark src and dst as dirty. */ + if (unlikely((rc = mdbx_page_touch(csrc)) || (rc = mdbx_page_touch(cdst)))) + return rc; + psrc = csrc->mc_pg[csrc->mc_top]; + pdst = cdst->mc_pg[cdst->mc_top]; + key4move.iov_len = csrc->mc_db->md_xsize; + key4move.iov_base = + page_leaf2key(psrc, csrc->mc_ki[csrc->mc_top], key4move.iov_len); + mdbx_debug("moving %s-node %u [%s] on page %" PRIaPGNO + " to node %u on page %" PRIaPGNO, + "leaf2", csrc->mc_ki[csrc->mc_top], DKEY_DEBUG(&key4move), + psrc->mp_pgno, cdst->mc_ki[cdst->mc_top], pdst->mp_pgno); + /* Add the node to the destination page. */ + rc = mdbx_node_add_leaf2(cdst, cdst->mc_ki[cdst->mc_top], &key4move); + } break; + + default: + goto bailout; + } + + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + /* Delete the node from the source page. 
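+   * (The key4move.iov_len argument matters only for P_LEAF2 pages, where
+   * mdbx_node_del() needs the fixed key size to address the slot.)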
*/ + mdbx_node_del(csrc, key4move.iov_len); + + mdbx_cassert(csrc, psrc == csrc->mc_pg[csrc->mc_top]); + mdbx_cassert(cdst, pdst == cdst->mc_pg[cdst->mc_top]); + mdbx_cassert(csrc, PAGETYPE(psrc) == PAGETYPE(pdst)); + + { + /* Adjust other cursors pointing to mp */ + MDBX_cursor *m2, *m3; + const MDBX_dbi dbi = csrc->mc_dbi; + mdbx_cassert(csrc, csrc->mc_top == cdst->mc_top); + if (fromleft) { + /* If we're adding on the left, bump others up */ + for (m2 = csrc->mc_txn->tw.cursors[dbi]; m2; m2 = m2->mc_next) { + m3 = (csrc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; + if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top) + continue; + if (m3 != cdst && m3->mc_pg[csrc->mc_top] == pdst && + m3->mc_ki[csrc->mc_top] >= cdst->mc_ki[csrc->mc_top]) { + m3->mc_ki[csrc->mc_top]++; + } + if (m3 != csrc && m3->mc_pg[csrc->mc_top] == psrc && + m3->mc_ki[csrc->mc_top] == csrc->mc_ki[csrc->mc_top]) { + m3->mc_pg[csrc->mc_top] = pdst; + m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top]; + mdbx_cassert(csrc, csrc->mc_top > 0); + m3->mc_ki[csrc->mc_top - 1]++; + } + if (XCURSOR_INITED(m3) && IS_LEAF(psrc)) + XCURSOR_REFRESH(m3, m3->mc_pg[csrc->mc_top], m3->mc_ki[csrc->mc_top]); + } + } else { + /* Adding on the right, bump others down */ + for (m2 = csrc->mc_txn->tw.cursors[dbi]; m2; m2 = m2->mc_next) { + m3 = (csrc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; + if (m3 == csrc) + continue; + if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top) + continue; + if (m3->mc_pg[csrc->mc_top] == psrc) { + if (!m3->mc_ki[csrc->mc_top]) { + m3->mc_pg[csrc->mc_top] = pdst; + m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top]; + mdbx_cassert(csrc, csrc->mc_top > 0); + m3->mc_ki[csrc->mc_top - 1]--; + } else { + m3->mc_ki[csrc->mc_top]--; + } + if (XCURSOR_INITED(m3) && IS_LEAF(psrc)) + XCURSOR_REFRESH(m3, m3->mc_pg[csrc->mc_top], + m3->mc_ki[csrc->mc_top]); + } + } + } + } + + /* Update the parent separators. 
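+   * Moving the leftmost node changes the lowest key under a parent, so
+   * the separator in the level above must be refreshed; branch pages keep
+   * a zero-length key at index 0, hence the nullkey updates below.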
*/ + if (csrc->mc_ki[csrc->mc_top] == 0) { + mdbx_cassert(csrc, csrc->mc_top > 0); + if (csrc->mc_ki[csrc->mc_top - 1] != 0) { + MDBX_val key; + if (IS_LEAF2(psrc)) { + key.iov_len = psrc->mp_leaf2_ksize; + key.iov_base = page_leaf2key(psrc, 0, key.iov_len); + } else { + MDBX_node *srcnode = page_node(psrc, 0); + key.iov_len = node_ks(srcnode); + key.iov_base = node_key(srcnode); + } + mdbx_debug("update separator for source page %" PRIaPGNO " to [%s]", + psrc->mp_pgno, DKEY_DEBUG(&key)); + MDBX_cursor mn; + cursor_copy(csrc, &mn); + mdbx_cassert(csrc, mn.mc_snum > 0); + mn.mc_snum--; + mn.mc_top--; + /* We want mdbx_rebalance to find mn when doing fixups */ + WITH_CURSOR_TRACKING(mn, rc = mdbx_update_key(&mn, &key)); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + } + if (IS_BRANCH(psrc)) { + const MDBX_val nullkey = {0, 0}; + const indx_t ix = csrc->mc_ki[csrc->mc_top]; + csrc->mc_ki[csrc->mc_top] = 0; + rc = mdbx_update_key(csrc, &nullkey); + csrc->mc_ki[csrc->mc_top] = ix; + mdbx_cassert(csrc, rc == MDBX_SUCCESS); + } + } + + if (cdst->mc_ki[cdst->mc_top] == 0) { + mdbx_cassert(cdst, cdst->mc_top > 0); + if (cdst->mc_ki[cdst->mc_top - 1] != 0) { + MDBX_val key; + if (IS_LEAF2(pdst)) { + key.iov_len = pdst->mp_leaf2_ksize; + key.iov_base = page_leaf2key(pdst, 0, key.iov_len); + } else { + MDBX_node *srcnode = page_node(pdst, 0); + key.iov_len = node_ks(srcnode); + key.iov_base = node_key(srcnode); + } + mdbx_debug("update separator for destination page %" PRIaPGNO " to [%s]", + pdst->mp_pgno, DKEY_DEBUG(&key)); + MDBX_cursor mn; + cursor_copy(cdst, &mn); + mdbx_cassert(cdst, mn.mc_snum > 0); + mn.mc_snum--; + mn.mc_top--; + /* We want mdbx_rebalance to find mn when doing fixups */ + WITH_CURSOR_TRACKING(mn, rc = mdbx_update_key(&mn, &key)); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + } + if (IS_BRANCH(pdst)) { + const MDBX_val nullkey = {0, 0}; + const indx_t ix = cdst->mc_ki[cdst->mc_top]; + cdst->mc_ki[cdst->mc_top] = 0; + rc = mdbx_update_key(cdst, &nullkey); + cdst->mc_ki[cdst->mc_top] = ix; + mdbx_cassert(cdst, rc == MDBX_SUCCESS); + } + } + + return MDBX_SUCCESS; +} + +/* Merge one page into another. + * + * The nodes from the page pointed to by csrc will be copied to the page + * pointed to by cdst and then the csrc page will be freed. + * + * [in] csrc Cursor pointing to the source page. + * [in] cdst Cursor pointing to the destination page. + * + * Returns 0 on success, non-zero on failure. 
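+ *
+ * May also return MDBX_RESULT_TRUE (before copying anything) when merging
+ * branch pages whose nodes, with the updated separator key, would not fit
+ * into cdst; mdbx_rebalance() then falls back to moving single nodes or
+ * retries with a zero room-threshold.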
*/ +static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { + MDBX_val key; + int rc; + + mdbx_cassert(csrc, csrc != cdst); + mdbx_cassert(csrc, cursor_is_tracked(csrc)); + mdbx_cassert(cdst, cursor_is_tracked(cdst)); + const MDBX_page *const psrc = csrc->mc_pg[csrc->mc_top]; + MDBX_page *pdst = cdst->mc_pg[cdst->mc_top]; + mdbx_debug("merging page %" PRIaPGNO " into %" PRIaPGNO, psrc->mp_pgno, + pdst->mp_pgno); + + mdbx_cassert(csrc, PAGETYPE(psrc) == PAGETYPE(pdst)); + mdbx_cassert(csrc, + csrc->mc_dbi == cdst->mc_dbi && csrc->mc_db == cdst->mc_db); + mdbx_cassert(csrc, csrc->mc_snum > 1); /* can't merge root page */ + mdbx_cassert(cdst, cdst->mc_snum > 1); + mdbx_cassert(cdst, cdst->mc_snum < cdst->mc_db->md_depth || + IS_LEAF(cdst->mc_pg[cdst->mc_db->md_depth - 1])); + mdbx_cassert(csrc, csrc->mc_snum < csrc->mc_db->md_depth || + IS_LEAF(csrc->mc_pg[csrc->mc_db->md_depth - 1])); + mdbx_cassert(cdst, page_room(pdst) >= page_used(cdst->mc_txn->mt_env, psrc)); + const int pagetype = PAGETYPE(psrc); + + /* Move all nodes from src to dst */ + const unsigned dst_nkeys = page_numkeys(pdst); + const unsigned src_nkeys = page_numkeys(psrc); + mdbx_cassert(cdst, dst_nkeys + src_nkeys >= (IS_LEAF(psrc) ? 1u : 2u)); + if (likely(src_nkeys)) { + unsigned j = dst_nkeys; + if (unlikely(pagetype & P_LEAF2)) { + /* Mark dst as dirty. */ + if (unlikely(rc = mdbx_page_touch(cdst))) + return rc; + + key.iov_len = csrc->mc_db->md_xsize; + key.iov_base = page_data(psrc); + unsigned i = 0; + do { + rc = mdbx_node_add_leaf2(cdst, j++, &key); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + key.iov_base = (char *)key.iov_base + key.iov_len; + } while (++i != src_nkeys); + } else { + MDBX_node *srcnode = page_node(psrc, 0); + key.iov_len = node_ks(srcnode); + key.iov_base = node_key(srcnode); + if (pagetype & P_BRANCH) { + MDBX_cursor mn; + cursor_copy(csrc, &mn); + /* must find the lowest key below src */ + rc = mdbx_page_search_lowest(&mn); + if (unlikely(rc)) + return rc; + + const MDBX_page *mp = mn.mc_pg[mn.mc_top]; + if (likely(!IS_LEAF2(mp))) { + mdbx_cassert(&mn, IS_LEAF(mp)); + const MDBX_node *lowest = page_node(mp, 0); + key.iov_len = node_ks(lowest); + key.iov_base = node_key(lowest); + } else { + mdbx_cassert(&mn, mn.mc_top > csrc->mc_top); + key.iov_len = mp->mp_leaf2_ksize; + key.iov_base = page_leaf2key(mp, mn.mc_ki[mn.mc_top], key.iov_len); + } + mdbx_cassert(&mn, key.iov_len >= csrc->mc_dbx->md_klen_min); + mdbx_cassert(&mn, key.iov_len <= csrc->mc_dbx->md_klen_max); + + const size_t dst_room = page_room(pdst); + const size_t src_used = page_used(cdst->mc_txn->mt_env, psrc); + const size_t space_needed = src_used - node_ks(srcnode) + key.iov_len; + if (unlikely(space_needed > dst_room)) + return MDBX_RESULT_TRUE; + } + + /* Mark dst as dirty. 
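+       * (Done after the room check above, so a merge that cannot succeed
+       * does not needlessly copy-on-write the destination page.)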
*/ + if (unlikely(rc = mdbx_page_touch(cdst))) + return rc; + + unsigned i = 0; + while (true) { + if (pagetype & P_LEAF) { + MDBX_val data; + data.iov_len = node_ds(srcnode); + data.iov_base = node_data(srcnode); + rc = mdbx_node_add_leaf(cdst, j++, &key, &data, node_flags(srcnode)); + } else { + mdbx_cassert(csrc, node_flags(srcnode) == 0); + rc = mdbx_node_add_branch(cdst, j++, &key, node_pgno(srcnode)); + } + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (++i == src_nkeys) + break; + srcnode = page_node(psrc, i); + key.iov_len = node_ks(srcnode); + key.iov_base = node_key(srcnode); + } + } + + pdst = cdst->mc_pg[cdst->mc_top]; + mdbx_debug("dst page %" PRIaPGNO " now has %u keys (%.1f%% filled)", + pdst->mp_pgno, page_numkeys(pdst), + page_fill(cdst->mc_txn->mt_env, pdst)); + + mdbx_cassert(csrc, psrc == csrc->mc_pg[csrc->mc_top]); + mdbx_cassert(cdst, pdst == cdst->mc_pg[cdst->mc_top]); + } + + /* Unlink the src page from parent and add to free list. */ + csrc->mc_top--; + mdbx_node_del(csrc, 0); + if (csrc->mc_ki[csrc->mc_top] == 0) { + const MDBX_val nullkey = {0, 0}; + rc = mdbx_update_key(csrc, &nullkey); + if (unlikely(rc)) { + csrc->mc_top++; + return rc; + } + } + csrc->mc_top++; + + mdbx_cassert(csrc, psrc == csrc->mc_pg[csrc->mc_top]); + mdbx_cassert(cdst, pdst == cdst->mc_pg[cdst->mc_top]); + + { + /* Adjust other cursors pointing to mp */ + MDBX_cursor *m2, *m3; + const MDBX_dbi dbi = csrc->mc_dbi; + const unsigned top = csrc->mc_top; + + for (m2 = csrc->mc_txn->tw.cursors[dbi]; m2; m2 = m2->mc_next) { + m3 = (csrc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; + if (m3 == csrc || top >= m3->mc_snum) + continue; + if (m3->mc_pg[top] == psrc) { + m3->mc_pg[top] = pdst; + mdbx_cassert(m3, dst_nkeys + m3->mc_ki[top] <= UINT16_MAX); + m3->mc_ki[top] += (indx_t)dst_nkeys; + m3->mc_ki[top - 1] = cdst->mc_ki[top - 1]; + } else if (m3->mc_pg[top - 1] == csrc->mc_pg[top - 1] && + m3->mc_ki[top - 1] > csrc->mc_ki[top - 1]) { + m3->mc_ki[top - 1]--; + } + if (XCURSOR_INITED(m3) && IS_LEAF(psrc)) + XCURSOR_REFRESH(m3, m3->mc_pg[top], m3->mc_ki[top]); + } + } + + /* If not operating on GC, allow this page to be reused + * in this txn. Otherwise just add to free list. 
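+   * mdbx_page_retire() below makes that distinction itself, depending on
+   * whether the page became dirty within the current transaction.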
*/ + rc = mdbx_page_retire(csrc, (MDBX_page *)psrc); + if (unlikely(rc)) + return rc; + + mdbx_cassert(cdst, cdst->mc_db->md_entries > 0); + mdbx_cassert(cdst, cdst->mc_snum <= cdst->mc_db->md_depth); + mdbx_cassert(cdst, cdst->mc_top > 0); + mdbx_cassert(cdst, cdst->mc_snum == cdst->mc_top + 1); + MDBX_page *const top_page = cdst->mc_pg[cdst->mc_top]; + const indx_t top_indx = cdst->mc_ki[cdst->mc_top]; + const unsigned save_snum = cdst->mc_snum; + const uint16_t save_depth = cdst->mc_db->md_depth; + mdbx_cursor_pop(cdst); + rc = mdbx_rebalance(cdst); + if (unlikely(rc)) + return rc; + + mdbx_cassert(cdst, cdst->mc_db->md_entries > 0); + mdbx_cassert(cdst, cdst->mc_snum <= cdst->mc_db->md_depth); + mdbx_cassert(cdst, cdst->mc_snum == cdst->mc_top + 1); + +#if MDBX_ENABLE_PGOP_STAT + safe64_inc(&cdst->mc_txn->mt_env->me_lck->mti_pgop_stat.merge, 1); +#endif /* MDBX_ENABLE_PGOP_STAT */ + + if (IS_LEAF(cdst->mc_pg[cdst->mc_top])) { + /* LY: don't touch cursor if top-page is a LEAF */ + mdbx_cassert(cdst, IS_LEAF(cdst->mc_pg[cdst->mc_top]) || + PAGETYPE(cdst->mc_pg[cdst->mc_top]) == pagetype); + return MDBX_SUCCESS; + } + + mdbx_cassert(cdst, page_numkeys(top_page) == dst_nkeys + src_nkeys); + + if (unlikely(pagetype != PAGETYPE(top_page))) { + /* LY: LEAF-page becomes BRANCH, unable restore cursor's stack */ + goto bailout; + } + + if (top_page == cdst->mc_pg[cdst->mc_top]) { + /* LY: don't touch cursor if prev top-page already on the top */ + mdbx_cassert(cdst, cdst->mc_ki[cdst->mc_top] == top_indx); + mdbx_cassert(cdst, IS_LEAF(cdst->mc_pg[cdst->mc_top]) || + PAGETYPE(cdst->mc_pg[cdst->mc_top]) == pagetype); + return MDBX_SUCCESS; + } + + const int new_snum = save_snum - save_depth + cdst->mc_db->md_depth; + if (unlikely(new_snum < 1 || new_snum > cdst->mc_db->md_depth)) { + /* LY: out of range, unable restore cursor's stack */ + goto bailout; + } + + if (top_page == cdst->mc_pg[new_snum - 1]) { + mdbx_cassert(cdst, cdst->mc_ki[new_snum - 1] == top_indx); + /* LY: restore cursor stack */ + cdst->mc_snum = (uint16_t)new_snum; + cdst->mc_top = (uint16_t)new_snum - 1; + mdbx_cassert(cdst, cdst->mc_snum < cdst->mc_db->md_depth || + IS_LEAF(cdst->mc_pg[cdst->mc_db->md_depth - 1])); + mdbx_cassert(cdst, IS_LEAF(cdst->mc_pg[cdst->mc_top]) || + PAGETYPE(cdst->mc_pg[cdst->mc_top]) == pagetype); + return MDBX_SUCCESS; + } + + MDBX_page *const stub_page = (MDBX_page *)(~(uintptr_t)top_page); + const indx_t stub_indx = top_indx; + if (save_depth > cdst->mc_db->md_depth && + ((cdst->mc_pg[save_snum - 1] == top_page && + cdst->mc_ki[save_snum - 1] == top_indx) || + (cdst->mc_pg[save_snum - 1] == stub_page && + cdst->mc_ki[save_snum - 1] == stub_indx))) { + /* LY: restore cursor stack */ + cdst->mc_pg[new_snum - 1] = top_page; + cdst->mc_ki[new_snum - 1] = top_indx; + cdst->mc_pg[new_snum] = (MDBX_page *)(~(uintptr_t)cdst->mc_pg[new_snum]); + cdst->mc_ki[new_snum] = ~cdst->mc_ki[new_snum]; + cdst->mc_snum = (uint16_t)new_snum; + cdst->mc_top = (uint16_t)new_snum - 1; + mdbx_cassert(cdst, cdst->mc_snum < cdst->mc_db->md_depth || + IS_LEAF(cdst->mc_pg[cdst->mc_db->md_depth - 1])); + mdbx_cassert(cdst, IS_LEAF(cdst->mc_pg[cdst->mc_top]) || + PAGETYPE(cdst->mc_pg[cdst->mc_top]) == pagetype); + return MDBX_SUCCESS; + } + +bailout: + /* LY: unable restore cursor's stack */ + cdst->mc_flags &= ~C_INITIALIZED; + return MDBX_CURSOR_FULL; +} + +static void cursor_restore(const MDBX_cursor *csrc, MDBX_cursor *cdst) { + mdbx_cassert(cdst, cdst->mc_dbi == csrc->mc_dbi); + mdbx_cassert(cdst, cdst->mc_txn == 
csrc->mc_txn); + mdbx_cassert(cdst, cdst->mc_db == csrc->mc_db); + mdbx_cassert(cdst, cdst->mc_dbx == csrc->mc_dbx); + mdbx_cassert(cdst, cdst->mc_dbistate == csrc->mc_dbistate); + cdst->mc_snum = csrc->mc_snum; + cdst->mc_top = csrc->mc_top; + cdst->mc_flags = csrc->mc_flags; + + for (unsigned i = 0; i < csrc->mc_snum; i++) { + cdst->mc_pg[i] = csrc->mc_pg[i]; + cdst->mc_ki[i] = csrc->mc_ki[i]; + } +} + +/* Copy the contents of a cursor. + * [in] csrc The cursor to copy from. + * [out] cdst The cursor to copy to. */ +static void cursor_copy(const MDBX_cursor *csrc, MDBX_cursor *cdst) { + mdbx_cassert(csrc, csrc->mc_txn->mt_txnid >= + csrc->mc_txn->mt_env->me_lck->mti_oldest_reader.weak); + cdst->mc_dbi = csrc->mc_dbi; + cdst->mc_next = NULL; + cdst->mc_backup = NULL; + cdst->mc_xcursor = NULL; + cdst->mc_txn = csrc->mc_txn; + cdst->mc_db = csrc->mc_db; + cdst->mc_dbx = csrc->mc_dbx; + cdst->mc_dbistate = csrc->mc_dbistate; + cursor_restore(csrc, cdst); +} + +/* Rebalance the tree after a delete operation. + * [in] mc Cursor pointing to the page where rebalancing should begin. + * Returns 0 on success, non-zero on failure. */ +static int mdbx_rebalance(MDBX_cursor *mc) { + mdbx_cassert(mc, cursor_is_tracked(mc)); + mdbx_cassert(mc, mc->mc_snum > 0); + mdbx_cassert(mc, mc->mc_snum < mc->mc_db->md_depth || + IS_LEAF(mc->mc_pg[mc->mc_db->md_depth - 1])); + const int pagetype = PAGETYPE(mc->mc_pg[mc->mc_top]); + + STATIC_ASSERT(P_BRANCH == 1); + const unsigned minkeys = (pagetype & P_BRANCH) + 1; + + /* Pages emptier than this are candidates for merging. */ + unsigned room_threshold = likely(mc->mc_dbi != FREE_DBI) + ? mc->mc_txn->mt_env->me_merge_threshold + : mc->mc_txn->mt_env->me_merge_threshold_gc; + + const MDBX_page *const tp = mc->mc_pg[mc->mc_top]; + const unsigned numkeys = page_numkeys(tp); + const unsigned room = page_room(tp); + mdbx_debug("rebalancing %s page %" PRIaPGNO + " (has %u keys, full %.1f%%, used %u, room %u bytes )", + (pagetype & P_LEAF) ? "leaf" : "branch", tp->mp_pgno, numkeys, + page_fill(mc->mc_txn->mt_env, tp), + page_used(mc->mc_txn->mt_env, tp), room); + + if (unlikely(numkeys < minkeys)) { + mdbx_debug("page %" PRIaPGNO " must be merged due keys < %u threshold", + tp->mp_pgno, minkeys); + } else if (unlikely(room > room_threshold)) { + mdbx_debug("page %" PRIaPGNO " should be merged due room %u > %u threshold", + tp->mp_pgno, room, room_threshold); + } else { + mdbx_debug("no need to rebalance page %" PRIaPGNO + ", room %u < %u threshold", + tp->mp_pgno, room, room_threshold); + mdbx_cassert(mc, mc->mc_db->md_entries > 0); + return MDBX_SUCCESS; + } + + int rc; + if (mc->mc_snum < 2) { + MDBX_page *const mp = mc->mc_pg[0]; + const unsigned nkeys = page_numkeys(mp); + mdbx_cassert(mc, (mc->mc_db->md_entries == 0) == (nkeys == 0)); + if (IS_SUBP(mp)) { + mdbx_debug("%s", "Can't rebalance a subpage, ignoring"); + mdbx_cassert(mc, pagetype & P_LEAF); + return MDBX_SUCCESS; + } + if (nkeys == 0) { + mdbx_cassert(mc, IS_LEAF(mp)); + mdbx_debug("%s", "tree is completely empty"); + mc->mc_db->md_root = P_INVALID; + mc->mc_db->md_depth = 0; + mdbx_cassert(mc, mc->mc_db->md_branch_pages == 0 && + mc->mc_db->md_overflow_pages == 0 && + mc->mc_db->md_leaf_pages == 1); + /* Adjust cursors pointing to mp */ + for (MDBX_cursor *m2 = mc->mc_txn->tw.cursors[mc->mc_dbi]; m2; + m2 = m2->mc_next) { + MDBX_cursor *m3 = + (mc->mc_flags & C_SUB) ? 
&m2->mc_xcursor->mx_cursor : m2; + if (m3 == mc || !(m3->mc_flags & C_INITIALIZED)) + continue; + if (m3->mc_pg[0] == mp) { + m3->mc_snum = 0; + m3->mc_top = 0; + m3->mc_flags &= ~C_INITIALIZED; + } + } + mc->mc_snum = 0; + mc->mc_top = 0; + mc->mc_flags &= ~C_INITIALIZED; + + rc = mdbx_page_retire(mc, mp); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + } else if (IS_BRANCH(mp) && nkeys == 1) { + mdbx_debug("%s", "collapsing root page!"); + mc->mc_db->md_root = node_pgno(page_node(mp, 0)); + rc = mdbx_page_get(mc, mc->mc_db->md_root, &mc->mc_pg[0], + pp_txnid4chk(mp, mc->mc_txn)); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + mc->mc_db->md_depth--; + mc->mc_ki[0] = mc->mc_ki[1]; + for (int i = 1; i < mc->mc_db->md_depth; i++) { + mc->mc_pg[i] = mc->mc_pg[i + 1]; + mc->mc_ki[i] = mc->mc_ki[i + 1]; + } + + /* Adjust other cursors pointing to mp */ + for (MDBX_cursor *m2 = mc->mc_txn->tw.cursors[mc->mc_dbi]; m2; + m2 = m2->mc_next) { + MDBX_cursor *m3 = + (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; + if (m3 == mc || !(m3->mc_flags & C_INITIALIZED)) + continue; + if (m3->mc_pg[0] == mp) { + for (int i = 0; i < mc->mc_db->md_depth; i++) { + m3->mc_pg[i] = m3->mc_pg[i + 1]; + m3->mc_ki[i] = m3->mc_ki[i + 1]; + } + m3->mc_snum--; + m3->mc_top--; + } + } + mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top]) || + PAGETYPE(mc->mc_pg[mc->mc_top]) == pagetype); + mdbx_cassert(mc, mc->mc_snum < mc->mc_db->md_depth || + IS_LEAF(mc->mc_pg[mc->mc_db->md_depth - 1])); + + rc = mdbx_page_retire(mc, mp); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + } else { + mdbx_debug("root page %" PRIaPGNO + " doesn't need rebalancing (flags 0x%x)", + mp->mp_pgno, mp->mp_flags); + } + return MDBX_SUCCESS; + } + + /* The parent (branch page) must have at least 2 pointers, + * otherwise the tree is invalid. */ + const unsigned pre_top = mc->mc_top - 1; + mdbx_cassert(mc, IS_BRANCH(mc->mc_pg[pre_top])); + mdbx_cassert(mc, !IS_SUBP(mc->mc_pg[0])); + mdbx_cassert(mc, page_numkeys(mc->mc_pg[pre_top]) > 1); + + /* Leaf page fill factor is below the threshold. + * Try to move keys from left or right neighbor, or + * merge with a neighbor page. */ + + /* Find neighbors. */ + MDBX_cursor mn; + cursor_copy(mc, &mn); + + MDBX_page *left = nullptr, *right = nullptr; + if (mn.mc_ki[pre_top] > 0) { + rc = mdbx_page_get( + &mn, node_pgno(page_node(mn.mc_pg[pre_top], mn.mc_ki[pre_top] - 1)), + &left, pp_txnid4chk(mn.mc_pg[pre_top], mc->mc_txn)); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + mdbx_cassert(mc, PAGETYPE(left) == PAGETYPE(mc->mc_pg[mc->mc_top])); + } + if (mn.mc_ki[pre_top] + 1u < page_numkeys(mn.mc_pg[pre_top])) { + rc = mdbx_page_get( + &mn, node_pgno(page_node(mn.mc_pg[pre_top], mn.mc_ki[pre_top] + 1)), + &right, pp_txnid4chk(mn.mc_pg[pre_top], mc->mc_txn)); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + mdbx_cassert(mc, PAGETYPE(right) == PAGETYPE(mc->mc_pg[mc->mc_top])); + } + mdbx_cassert(mc, left || right); + + const unsigned ki_top = mc->mc_ki[mc->mc_top]; + const unsigned ki_pre_top = mn.mc_ki[pre_top]; + const unsigned nkeys = page_numkeys(mn.mc_pg[mn.mc_top]); + + const unsigned left_room = left ? page_room(left) : 0; + const unsigned right_room = right ? page_room(right) : 0; + const unsigned left_nkeys = left ? page_numkeys(left) : 0; + const unsigned right_nkeys = right ? 
page_numkeys(right) : 0; +retry: + if (left_room > room_threshold && left_room >= right_room) { + /* try merge with left */ + mdbx_cassert(mc, left_nkeys >= minkeys); + mn.mc_pg[mn.mc_top] = left; + mn.mc_ki[mn.mc_top - 1] = (indx_t)(ki_pre_top - 1); + mn.mc_ki[mn.mc_top] = (indx_t)(left_nkeys - 1); + mc->mc_ki[mc->mc_top] = 0; + const unsigned new_ki = ki_top + left_nkeys; + mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1; + /* We want mdbx_rebalance to find mn when doing fixups */ + WITH_CURSOR_TRACKING(mn, rc = mdbx_page_merge(mc, &mn)); + if (likely(rc != MDBX_RESULT_TRUE)) { + cursor_restore(&mn, mc); + mc->mc_ki[mc->mc_top] = (indx_t)new_ki; + mdbx_cassert(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); + return rc; + } + } + if (right_room > room_threshold) { + /* try merge with right */ + mdbx_cassert(mc, right_nkeys >= minkeys); + mn.mc_pg[mn.mc_top] = right; + mn.mc_ki[mn.mc_top - 1] = (indx_t)(ki_pre_top + 1); + mn.mc_ki[mn.mc_top] = 0; + mc->mc_ki[mc->mc_top] = (indx_t)nkeys; + WITH_CURSOR_TRACKING(mn, rc = mdbx_page_merge(&mn, mc)); + if (likely(rc != MDBX_RESULT_TRUE)) { + mc->mc_ki[mc->mc_top] = (indx_t)ki_top; + mdbx_cassert(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); + return rc; + } + } + + if (left_nkeys > minkeys && + (right_nkeys <= left_nkeys || right_room >= left_room)) { + /* try move from left */ + mn.mc_pg[mn.mc_top] = left; + mn.mc_ki[mn.mc_top - 1] = (indx_t)(ki_pre_top - 1); + mn.mc_ki[mn.mc_top] = (indx_t)(left_nkeys - 1); + mc->mc_ki[mc->mc_top] = 0; + WITH_CURSOR_TRACKING(mn, rc = mdbx_node_move(&mn, mc, true)); + if (likely(rc != MDBX_RESULT_TRUE)) { + mc->mc_ki[mc->mc_top] = (indx_t)(ki_top + 1); + mdbx_cassert(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); + return rc; + } + } + if (right_nkeys > minkeys) { + /* try move from right */ + mn.mc_pg[mn.mc_top] = right; + mn.mc_ki[mn.mc_top - 1] = (indx_t)(ki_pre_top + 1); + mn.mc_ki[mn.mc_top] = 0; + mc->mc_ki[mc->mc_top] = (indx_t)nkeys; + WITH_CURSOR_TRACKING(mn, rc = mdbx_node_move(&mn, mc, false)); + if (likely(rc != MDBX_RESULT_TRUE)) { + mc->mc_ki[mc->mc_top] = (indx_t)ki_top; + mdbx_cassert(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); + return rc; + } + } + + if (nkeys >= minkeys) { + mc->mc_ki[mc->mc_top] = (indx_t)ki_top; + if (!mdbx_audit_enabled()) + return MDBX_SUCCESS; + return mdbx_cursor_check(mc, C_UPDATING); + } + + if (likely(room_threshold > 0)) { + room_threshold = 0; + goto retry; + } + mdbx_error("Unable to merge/rebalance %s page %" PRIaPGNO + " (has %u keys, full %.1f%%, used %u, room %u bytes )", + (pagetype & P_LEAF) ? 
"leaf" : "branch", tp->mp_pgno, numkeys, + page_fill(mc->mc_txn->mt_env, tp), + page_used(mc->mc_txn->mt_env, tp), room); + return MDBX_PROBLEM; +} + +static __cold int mdbx_page_check(MDBX_cursor *const mc, + const MDBX_page *const mp, unsigned options) { + DKBUF; + options |= mc->mc_flags & (C_COPYING | C_UPDATING | C_RETIRING | C_SKIPORD); + MDBX_env *const env = mc->mc_txn->mt_env; + const unsigned nkeys = page_numkeys(mp); + char *const end_of_page = (char *)mp + env->me_psize; + if (unlikely(mp->mp_pgno < MIN_PAGENO || mp->mp_pgno > MAX_PAGENO)) + return bad_page(mp, "invalid pgno (%u)\n", mp->mp_pgno); + if (IS_OVERFLOW(mp)) { + if (unlikely(mp->mp_pages < 1 && mp->mp_pages >= MAX_PAGENO / 2)) + return bad_page(mp, "invalid overflow n-pages (%u)\n", mp->mp_pages); + if (unlikely(mp->mp_pgno + mp->mp_pages > mc->mc_txn->mt_next_pgno)) + return bad_page(mp, "overflow page beyond (%u) next-pgno\n", + mp->mp_pgno + mp->mp_pages); + return MDBX_SUCCESS; + } + + int rc = MDBX_SUCCESS; + if ((options & C_UPDATING) == 0 || !IS_MODIFIABLE(mc->mc_txn, mp)) { + if (unlikely(nkeys < 2 && IS_BRANCH(mp))) + rc = bad_page(mp, "branch-page nkey (%u) < 2\n", nkeys); + } + + MDBX_val here, prev = {0, 0}; + for (unsigned i = 0; i < nkeys; ++i) { + if (IS_LEAF2(mp)) { + const size_t ksize = mp->mp_leaf2_ksize; + char *const key = page_leaf2key(mp, i, ksize); + if (unlikely(end_of_page < key + ksize)) { + rc = bad_page(mp, "leaf2-key beyond (%zu) page-end\n", + key + ksize - end_of_page); + continue; + } + + if ((options & C_COPYING) == 0) { + if (unlikely(ksize != mc->mc_dbx->md_klen_min)) { + if (unlikely(ksize < mc->mc_dbx->md_klen_min || + ksize > mc->mc_dbx->md_klen_max)) + rc = bad_page( + mp, "leaf2-key size (%zu) <> min/max key-length (%zu/%zu)\n", + ksize, mc->mc_dbx->md_klen_min, mc->mc_dbx->md_klen_max); + else + mc->mc_dbx->md_klen_min = mc->mc_dbx->md_klen_max = ksize; + } + if ((options & C_SKIPORD) == 0) { + here.iov_len = ksize; + here.iov_base = key; + if (prev.iov_base && unlikely(mc->mc_dbx->md_cmp(&prev, &here) >= 0)) + rc = bad_page(mp, "leaf2-key #%u wrong order (%s >= %s)\n", i, + DKEY(&prev), DVAL(&here)); + prev = here; + } + } + } else { + const MDBX_node *const node = page_node(mp, i); + const char *node_end = (char *)node + NODESIZE; + if (unlikely(node_end > end_of_page)) { + rc = bad_page(mp, "node[%u] (%zu) beyond page-end\n", i, + node_end - end_of_page); + continue; + } + size_t ksize = node_ks(node); + char *key = node_key(node); + if (unlikely(end_of_page < key + ksize)) { + rc = bad_page(mp, "node[%u] key (%zu) beyond page-end\n", i, + key + ksize - end_of_page); + continue; + } + if ((IS_LEAF(mp) || i > 0) && (options & C_COPYING) == 0) { + if (unlikely(ksize < mc->mc_dbx->md_klen_min || + ksize > mc->mc_dbx->md_klen_max)) + rc = bad_page( + mp, "node[%u] key size (%zu) <> min/max key-length (%zu/%zu)\n", + i, ksize, mc->mc_dbx->md_klen_min, mc->mc_dbx->md_klen_max); + if ((options & C_SKIPORD) == 0) { + here.iov_base = key; + here.iov_len = ksize; + if (prev.iov_base && unlikely(mc->mc_dbx->md_cmp(&prev, &here) >= 0)) + rc = bad_page(mp, "node[%u] key wrong order (%s >= %s)\n", i, + DKEY(&prev), DVAL(&here)); + prev = here; + } + } + if (IS_BRANCH(mp)) { + if ((options & C_UPDATING) == 0 && i == 0 && unlikely(ksize != 0)) + rc = bad_page(mp, "branch-node[%u] wrong 0-node key-length (%zu)\n", + i, ksize); + if ((options & C_RETIRING) == 0) { + const pgno_t ref = node_pgno(node); + if (unlikely(ref < MIN_PAGENO || ref >= mc->mc_txn->mt_next_pgno)) + rc = bad_page(mp, 
"branch-node[%u] wrong pgno (%u)\n", i, ref); + } + if (unlikely(node_flags(node))) + rc = bad_page(mp, "branch-node[%u] wrong flags (%u)\n", i, + node_flags(node)); + continue; + } + + switch (node_flags(node)) { + default: + rc = bad_page(mp, "invalid node[%u] flags (%u)\n", i, node_flags(node)); + break; + case F_BIGDATA /* data on large-page */: + case 0 /* usual */: + case F_SUBDATA /* sub-db */: + case F_SUBDATA | F_DUPDATA /* dupsorted sub-tree */: + case F_DUPDATA /* short sub-page */: + break; + } + + if (node_flags(node) & F_BIGDATA) { + const size_t dsize = node_ds(node); + if ((options & C_COPYING) == 0) { + if (unlikely(dsize <= mc->mc_dbx->md_vlen_min || + dsize > mc->mc_dbx->md_vlen_max)) + rc = bad_page( + mp, + "big-node data size (%zu) <> min/max value-length (%zu/%zu)\n", + dsize, mc->mc_dbx->md_vlen_min, mc->mc_dbx->md_vlen_max); + } + if ((options & C_RETIRING) == 0) { + MDBX_page *lp; + int err = mdbx_page_get(mc, node_largedata_pgno(node), &lp, + pp_txnid4chk(mp, mc->mc_txn)); + if (unlikely(err != MDBX_SUCCESS)) + return err; + if (unlikely(!IS_OVERFLOW(lp))) { + rc = bad_page(mp, "big-node refs to non-overflow page (%u)\n", + lp->mp_pgno); + continue; + } + if (unlikely(number_of_ovpages(env, dsize) > lp->mp_pages)) + rc = + bad_page(mp, "big-node size (%zu) mismatch n-pages size (%u)\n", + dsize, lp->mp_pages); + } + continue; + } + + const size_t dsize = node_ds(node); + const char *const data = node_data(node); + if (unlikely(end_of_page < data + dsize)) { + rc = bad_page(mp, + "node-data(%u of %u, %zu bytes) beyond (%zu) page-end\n", + i, nkeys, dsize, data + dsize - end_of_page); + continue; + } + + switch (node_flags(node)) { + default: + /* wrong, but already handled */ + continue; + case 0 /* usual */: + if ((options & C_COPYING) == 0) { + if (unlikely(dsize < mc->mc_dbx->md_vlen_min || + dsize > mc->mc_dbx->md_vlen_max)) { + rc = bad_page( + mp, "node-data size (%zu) <> min/max value-length (%zu/%zu)\n", + dsize, mc->mc_dbx->md_vlen_min, mc->mc_dbx->md_vlen_max); + continue; + } + } + break; + case F_SUBDATA /* sub-db */: + if (unlikely(dsize != sizeof(MDBX_db))) { + rc = bad_page(mp, "invalid sub-db record size (%zu)\n", dsize); + continue; + } + break; + case F_SUBDATA | F_DUPDATA /* dupsorted sub-tree */: + if (unlikely(dsize != sizeof(MDBX_db))) { + rc = bad_page(mp, "invalid nested-db record size (%zu)\n", dsize); + continue; + } + break; + case F_DUPDATA /* short sub-page */: + if (unlikely(dsize <= PAGEHDRSZ)) { + rc = bad_page(mp, "invalid nested-page record size (%zu)\n", dsize); + continue; + } else { + const MDBX_page *const sp = (MDBX_page *)data; + const char *const end_of_subpage = data + dsize; + const int nsubkeys = page_numkeys(sp); + switch (sp->mp_flags) { + case P_LEAF | P_SUBP: + case P_LEAF | P_LEAF2 | P_SUBP: + break; + default: + rc = bad_page(mp, "invalid nested-page flags (%u)\n", sp->mp_flags); + continue; + } + + MDBX_val sub_here, sub_prev = {0, 0}; + for (int j = 0; j < nsubkeys; j++) { + if (IS_LEAF2(sp)) { + /* LEAF2 pages have no mp_ptrs[] or node headers */ + size_t sub_ksize = sp->mp_leaf2_ksize; + char *sub_key = page_leaf2key(sp, j, sub_ksize); + if (unlikely(end_of_subpage < sub_key + sub_ksize)) { + rc = bad_page(mp, "nested-leaf2-key beyond (%zu) nested-page\n", + sub_key + sub_ksize - end_of_subpage); + continue; + } + + if ((options & C_COPYING) == 0) { + if (unlikely(sub_ksize != mc->mc_dbx->md_vlen_min)) { + if (unlikely(sub_ksize < mc->mc_dbx->md_vlen_min || + sub_ksize > mc->mc_dbx->md_vlen_max)) { + rc = 
bad_page(mp, + "nested-leaf2-key size (%zu) <> min/max " + "value-length (%zu/%zu)\n", + sub_ksize, mc->mc_dbx->md_vlen_min, + mc->mc_dbx->md_vlen_max); + continue; + } + mc->mc_dbx->md_vlen_min = mc->mc_dbx->md_vlen_max = sub_ksize; + } + if ((options & C_SKIPORD) == 0) { + sub_here.iov_len = sub_ksize; + sub_here.iov_base = sub_key; + if (sub_prev.iov_base && + unlikely(mc->mc_dbx->md_dcmp(&sub_prev, &sub_here) >= 0)) + rc = bad_page( + mp, "nested-leaf2-key #%u wrong order (%s >= %s)\n", j, + DKEY(&sub_prev), DVAL(&sub_here)); + sub_prev = sub_here; + } + } + } else { + const MDBX_node *const sub_node = page_node(sp, j); + const char *sub_node_end = (char *)sub_node + NODESIZE; + if (unlikely(sub_node_end > end_of_subpage)) { + rc = bad_page(mp, "nested-node beyond (%zu) nested-page\n", + end_of_subpage - sub_node_end); + continue; + } + if (unlikely(node_flags(sub_node) != 0)) + rc = bad_page(mp, "nested-node invalid flags (%u)\n", + node_flags(sub_node)); + + size_t sub_ksize = node_ks(sub_node); + char *sub_key = node_key(sub_node); + size_t sub_dsize = node_ds(sub_node); + /* char *sub_data = node_data(sub_node); */ + + if ((options & C_COPYING) == 0) { + if (unlikely(sub_ksize < mc->mc_dbx->md_vlen_min || + sub_ksize > mc->mc_dbx->md_vlen_max)) + rc = bad_page(mp, + "nested-node-key size (%zu) <> min/max " + "value-length (%zu/%zu)\n", + sub_ksize, mc->mc_dbx->md_vlen_min, + mc->mc_dbx->md_vlen_max); + + if ((options & C_SKIPORD) == 0) { + sub_here.iov_len = sub_ksize; + sub_here.iov_base = sub_key; + if (sub_prev.iov_base && + unlikely(mc->mc_dbx->md_dcmp(&sub_prev, &sub_here) >= 0)) + rc = bad_page( + mp, "nested-node-key #%u wrong order (%s >= %s)\n", j, + DKEY(&sub_prev), DVAL(&sub_here)); + sub_prev = sub_here; + } + } + if (unlikely(sub_dsize != 0)) + rc = bad_page(mp, "nested-node non-empty data size (%zu)\n", + sub_dsize); + if (unlikely(end_of_subpage < sub_key + sub_ksize)) + rc = bad_page(mp, "nested-node-key beyond (%zu) nested-page\n", + sub_key + sub_ksize - end_of_subpage); + } + } + } + break; + } + } + } + return rc; +} + +static __cold int mdbx_cursor_check(MDBX_cursor *mc, unsigned options) { + mdbx_cassert(mc, + mc->mc_txn->tw.dirtyroom + mc->mc_txn->tw.dirtylist->length == + (mc->mc_txn->mt_parent + ? mc->mc_txn->mt_parent->tw.dirtyroom + : mc->mc_txn->mt_env->me_options.dp_limit)); + mdbx_cassert(mc, mc->mc_top == mc->mc_snum - 1 || (options & C_UPDATING)); + if (unlikely(mc->mc_top != mc->mc_snum - 1) && (options & C_UPDATING) == 0) + return MDBX_CURSOR_FULL; + mdbx_cassert(mc, (options & C_UPDATING) ? mc->mc_snum <= mc->mc_db->md_depth + : mc->mc_snum == mc->mc_db->md_depth); + if (unlikely((options & C_UPDATING) ? mc->mc_snum > mc->mc_db->md_depth + : mc->mc_snum != mc->mc_db->md_depth)) + return MDBX_CURSOR_FULL; + + for (int n = 0; n < (int)mc->mc_snum; ++n) { + MDBX_page *mp = mc->mc_pg[n]; + const unsigned nkeys = page_numkeys(mp); + const bool expect_branch = (n < mc->mc_db->md_depth - 1) ? true : false; + const bool expect_nested_leaf = + (n + 1 == mc->mc_db->md_depth - 1) ? true : false; + const bool branch = IS_BRANCH(mp) ? 
true : false; + mdbx_cassert(mc, branch == expect_branch); + if (unlikely(branch != expect_branch)) + return MDBX_CURSOR_FULL; + if ((options & C_UPDATING) == 0) { + mdbx_cassert(mc, + nkeys > mc->mc_ki[n] || (!branch && nkeys == mc->mc_ki[n] && + (mc->mc_flags & C_EOF) != 0)); + if (unlikely(nkeys <= mc->mc_ki[n] && + !(!branch && nkeys == mc->mc_ki[n] && + (mc->mc_flags & C_EOF) != 0))) + return MDBX_CURSOR_FULL; + } else { + mdbx_cassert(mc, nkeys + 1 >= mc->mc_ki[n]); + if (unlikely(nkeys + 1 < mc->mc_ki[n])) + return MDBX_CURSOR_FULL; + } + + int err = mdbx_page_check(mc, mp, options); + if (unlikely(err != MDBX_SUCCESS)) + return err; + + for (unsigned i = 0; i < nkeys; ++i) { + if (branch) { + MDBX_node *node = page_node(mp, i); + mdbx_cassert(mc, node_flags(node) == 0); + if (unlikely(node_flags(node) != 0)) + return MDBX_CURSOR_FULL; + pgno_t pgno = node_pgno(node); + MDBX_page *np; + int rc = mdbx_page_get(mc, pgno, &np, pp_txnid4chk(mp, mc->mc_txn)); + mdbx_cassert(mc, rc == MDBX_SUCCESS); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + const bool nested_leaf = IS_LEAF(np) ? true : false; + mdbx_cassert(mc, nested_leaf == expect_nested_leaf); + if (unlikely(nested_leaf != expect_nested_leaf)) + return MDBX_CURSOR_FULL; + err = mdbx_page_check(mc, np, options); + if (unlikely(err != MDBX_SUCCESS)) + return err; + } + } + } + return MDBX_SUCCESS; +} + +/* Complete a delete operation started by mdbx_cursor_del(). */ +static int mdbx_cursor_del0(MDBX_cursor *mc) { + int rc; + MDBX_page *mp; + indx_t ki; + unsigned nkeys; + MDBX_dbi dbi = mc->mc_dbi; + + mdbx_cassert(mc, cursor_is_tracked(mc)); + mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); + ki = mc->mc_ki[mc->mc_top]; + mp = mc->mc_pg[mc->mc_top]; + mdbx_node_del(mc, mc->mc_db->md_xsize); + mc->mc_db->md_entries--; + + /* Adjust other cursors pointing to mp */ + for (MDBX_cursor *m2 = mc->mc_txn->tw.cursors[dbi]; m2; m2 = m2->mc_next) { + MDBX_cursor *m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; + if (m3 == mc || !(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) + continue; + if (m3->mc_snum < mc->mc_snum) + continue; + if (m3->mc_pg[mc->mc_top] == mp) { + if (m3->mc_ki[mc->mc_top] == ki) { + m3->mc_flags |= C_DEL; + if (mc->mc_db->md_flags & MDBX_DUPSORT) { + /* Sub-cursor referred into dataset which is gone */ + m3->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); + } + continue; + } else if (m3->mc_ki[mc->mc_top] > ki) { + m3->mc_ki[mc->mc_top]--; + } + if (XCURSOR_INITED(m3)) + XCURSOR_REFRESH(m3, m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]); + } + } + + rc = mdbx_rebalance(mc); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + + if (unlikely(!mc->mc_snum)) { + /* DB is totally empty now, just bail out. + * Other cursors adjustments were already done + * by mdbx_rebalance and aren't needed here. */ + mdbx_cassert(mc, mc->mc_db->md_entries == 0 && mc->mc_db->md_depth == 0 && + mc->mc_db->md_root == P_INVALID); + mc->mc_flags |= C_EOF; + return MDBX_SUCCESS; + } + + ki = mc->mc_ki[mc->mc_top]; + mp = mc->mc_pg[mc->mc_top]; + mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); + nkeys = page_numkeys(mp); + mdbx_cassert(mc, (mc->mc_db->md_entries > 0 && nkeys > 0) || + ((mc->mc_flags & C_SUB) && mc->mc_db->md_entries == 0 && + nkeys == 0)); + + /* Adjust this and other cursors pointing to mp */ + for (MDBX_cursor *m2 = mc->mc_txn->tw.cursors[dbi]; m2; m2 = m2->mc_next) { + MDBX_cursor *m3 = (mc->mc_flags & C_SUB) ? 
&m2->mc_xcursor->mx_cursor : m2; + if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) + continue; + if (m3->mc_snum < mc->mc_snum) + continue; + if (m3->mc_pg[mc->mc_top] == mp) { + /* if m3 points past last node in page, find next sibling */ + if (m3->mc_ki[mc->mc_top] >= nkeys) { + rc = mdbx_cursor_sibling(m3, SIBLING_RIGHT); + if (rc == MDBX_NOTFOUND) { + m3->mc_flags |= C_EOF; + rc = MDBX_SUCCESS; + continue; + } + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } + if (m3->mc_ki[mc->mc_top] >= ki || + /* moved to right sibling */ m3->mc_pg[mc->mc_top] != mp) { + if (m3->mc_xcursor && !(m3->mc_flags & C_EOF)) { + MDBX_node *node = + page_node(m3->mc_pg[m3->mc_top], m3->mc_ki[m3->mc_top]); + /* If this node has dupdata, it may need to be reinited + * because its data has moved. + * If the xcursor was not inited it must be reinited. + * Else if node points to a subDB, nothing is needed. */ + if (node_flags(node) & F_DUPDATA) { + if (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { + if (!(node_flags(node) & F_SUBDATA)) + m3->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node); + } else { + rc = mdbx_xcursor_init1(m3, node, m3->mc_pg[m3->mc_top]); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + rc = mdbx_cursor_first(&m3->mc_xcursor->mx_cursor, NULL, NULL); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } + } + m3->mc_xcursor->mx_cursor.mc_flags |= C_DEL; + } + m3->mc_flags |= C_DEL; + } + } + } + + mdbx_cassert(mc, rc == MDBX_SUCCESS); + if (mdbx_audit_enabled()) + rc = mdbx_cursor_check(mc, 0); + return rc; + +bailout: + mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; + return rc; +} + +int mdbx_del(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, + const MDBX_val *data) { + int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(!key)) + return MDBX_EINVAL; + + if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID))) + return MDBX_BAD_DBI; + + if (unlikely(txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED))) + return (txn->mt_flags & MDBX_TXN_RDONLY) ? MDBX_EACCESS : MDBX_BAD_TXN; + + return mdbx_del0(txn, dbi, key, data, 0); +} + +static int mdbx_del0(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, + const MDBX_val *data, unsigned flags) { + MDBX_cursor_couple cx; + MDBX_cursor_op op; + MDBX_val rdata; + int rc; + DKBUF_DEBUG; + + mdbx_debug("====> delete db %u key [%s], data [%s]", dbi, DKEY_DEBUG(key), + DVAL_DEBUG(data)); + + rc = mdbx_cursor_init(&cx.outer, txn, dbi); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (data) { + op = MDBX_GET_BOTH; + rdata = *data; + data = &rdata; + } else { + op = MDBX_SET; + flags |= MDBX_ALLDUPS; + } + rc = mdbx_cursor_set(&cx.outer, (MDBX_val *)key, (MDBX_val *)data, op).err; + if (likely(rc == MDBX_SUCCESS)) { + /* let mdbx_page_split know about this cursor if needed: + * delete will trigger a rebalance; if it needs to move + * a node from one page to another, it will have to + * update the parent's separator key(s). If the new sepkey + * is larger than the current one, the parent page may + * run out of space, triggering a split. We need this + * cursor to be consistent until the end of the rebalance. */ + cx.outer.mc_next = txn->tw.cursors[dbi]; + txn->tw.cursors[dbi] = &cx.outer; + rc = mdbx_cursor_del(&cx.outer, flags); + txn->tw.cursors[dbi] = cx.outer.mc_next; + } + return rc; +} + +/* Split a page and insert a new node. + * Set MDBX_TXN_ERROR on failure. + * [in,out] mc Cursor pointing to the page and desired insertion index. 
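+ * (that is, mc->mc_pg[mc->mc_top] is the page being split and
+ * mc->mc_ki[mc->mc_top] selects the slot intended for the new node).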
+ * The cursor will be updated to point to the actual page and index where + * the node got inserted after the split. + * [in] newkey The key for the newly inserted node. + * [in] newdata The data for the newly inserted node. + * [in] newpgno The page number, if the new node is a branch node. + * [in] nflags The NODE_ADD_FLAGS for the new node. + * Returns 0 on success, non-zero on failure. */ +static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, + MDBX_val *const newdata, pgno_t newpgno, + unsigned nflags) { + unsigned flags; + int rc = MDBX_SUCCESS, foliage = 0; + unsigned i, ptop; + MDBX_env *const env = mc->mc_txn->mt_env; + MDBX_val sepkey, rkey, xdata; + MDBX_page *tmp_ki_copy = NULL; + DKBUF; + + MDBX_page *const mp = mc->mc_pg[mc->mc_top]; + const unsigned newindx = mc->mc_ki[mc->mc_top]; + unsigned nkeys = page_numkeys(mp); + if (mdbx_audit_enabled()) { + rc = mdbx_cursor_check(mc, C_UPDATING); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + } + STATIC_ASSERT(P_BRANCH == 1); + const unsigned minkeys = (mp->mp_flags & P_BRANCH) + 1; + + mdbx_debug(">> splitting %s-page %" PRIaPGNO + " and adding %zu+%zu [%s] at %i, nkeys %i", + IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno, newkey->iov_len, + newdata ? newdata->iov_len : 0, DKEY_DEBUG(newkey), + mc->mc_ki[mc->mc_top], nkeys); + mdbx_cassert(mc, nkeys + 1 >= minkeys * 2); + + /* Create a new sibling page. */ + struct page_result npr = mdbx_page_new(mc, mp->mp_flags, 1); + if (unlikely(npr.err != MDBX_SUCCESS)) + return npr.err; + MDBX_page *const sister = npr.page; + sister->mp_leaf2_ksize = mp->mp_leaf2_ksize; + mdbx_debug("new sibling: page %" PRIaPGNO, sister->mp_pgno); + + /* Usually when splitting the root page, the cursor + * height is 1. But when called from mdbx_update_key, + * the cursor height may be greater because it walks + * up the stack while finding the branch slot to update. */ + if (mc->mc_top < 1) { + npr = mdbx_page_new(mc, P_BRANCH, 1); + rc = npr.err; + if (unlikely(rc != MDBX_SUCCESS)) + goto done; + MDBX_page *const pp = npr.page; + /* shift current top to make room for new parent */ + mdbx_cassert(mc, mc->mc_snum < 2 && mc->mc_db->md_depth > 0); +#if MDBX_DEBUG + memset(mc->mc_pg + 3, 0, sizeof(mc->mc_pg) - sizeof(mc->mc_pg[0]) * 3); + memset(mc->mc_ki + 3, -1, sizeof(mc->mc_ki) - sizeof(mc->mc_ki[0]) * 3); +#endif + mc->mc_pg[2] = mc->mc_pg[1]; + mc->mc_ki[2] = mc->mc_ki[1]; + mc->mc_pg[1] = mc->mc_pg[0]; + mc->mc_ki[1] = mc->mc_ki[0]; + mc->mc_pg[0] = pp; + mc->mc_ki[0] = 0; + mc->mc_db->md_root = pp->mp_pgno; + mdbx_debug("root split! new root = %" PRIaPGNO, pp->mp_pgno); + foliage = mc->mc_db->md_depth++; + + /* Add left (implicit) pointer. */ + rc = mdbx_node_add_branch(mc, 0, NULL, mp->mp_pgno); + if (unlikely(rc != MDBX_SUCCESS)) { + /* undo the pre-push */ + mc->mc_pg[0] = mc->mc_pg[1]; + mc->mc_ki[0] = mc->mc_ki[1]; + mc->mc_db->md_root = mp->mp_pgno; + mc->mc_db->md_depth--; + goto done; + } + mc->mc_snum++; + mc->mc_top++; + ptop = 0; + if (mdbx_audit_enabled()) { + rc = mdbx_cursor_check(mc, C_UPDATING); + if (unlikely(rc != MDBX_SUCCESS)) + goto done; + } + } else { + ptop = mc->mc_top - 1; + mdbx_debug("parent branch page is %" PRIaPGNO, mc->mc_pg[ptop]->mp_pgno); + } + + MDBX_cursor mn; + cursor_copy(mc, &mn); + mn.mc_pg[mn.mc_top] = sister; + mn.mc_ki[mn.mc_top] = 0; + mn.mc_ki[ptop] = mc->mc_ki[ptop] + 1; + + unsigned split_indx = + (newindx < nkeys) + ? /* split at the middle */ (nkeys + 1) / 2 + : /* split at the end (i.e. 
like append-mode ) */ nkeys - minkeys + 1; + + mdbx_cassert(mc, !IS_BRANCH(mp) || newindx > 0); + /* It is reasonable and possible to split the page at the begin */ + if (unlikely(newindx < minkeys)) { + split_indx = minkeys; + if (newindx == 0 && foliage == 0 && !(nflags & MDBX_SPLIT_REPLACE)) { + split_indx = 0; + /* Checking for ability of splitting by the left-side insertion + * of a pure page with the new key */ + for (i = 0; i < mc->mc_top; ++i) + if (mc->mc_ki[i]) { + get_key(page_node(mc->mc_pg[i], mc->mc_ki[i]), &sepkey); + if (mc->mc_dbx->md_cmp(newkey, &sepkey) >= 0) + split_indx = minkeys; + break; + } + if (split_indx == 0) { + /* Save the current first key which was omitted on the parent branch + * page and should be updated if the new first entry will be added */ + if (IS_LEAF2(mp)) { + sepkey.iov_len = mp->mp_leaf2_ksize; + sepkey.iov_base = page_leaf2key(mp, 0, sepkey.iov_len); + } else + get_key(page_node(mp, 0), &sepkey); + mdbx_cassert(mc, mc->mc_dbx->md_cmp(newkey, &sepkey) < 0); + /* Avoiding rare complex cases of split the parent page */ + if (page_room(mn.mc_pg[ptop]) < branch_size(env, &sepkey)) + split_indx = minkeys; + } + } + } + + const bool pure_right = split_indx == nkeys; + const bool pure_left = split_indx == 0; + if (unlikely(pure_right)) { + /* newindx == split_indx == nkeys */ + mdbx_trace("no-split, but add new pure page at the %s", "right/after"); + mdbx_cassert(mc, newindx == nkeys && split_indx == nkeys && minkeys == 1); + sepkey = *newkey; + } else if (unlikely(pure_left)) { + /* newindx == split_indx == 0 */ + mdbx_trace("no-split, but add new pure page at the %s", "left/before"); + mdbx_cassert(mc, newindx == 0 && split_indx == 0 && minkeys == 1); + mdbx_trace("old-first-key is %s", DKEY_DEBUG(&sepkey)); + } else { + if (IS_LEAF2(sister)) { + char *split, *ins; + unsigned lsize, rsize, ksize; + /* Move half of the keys to the right sibling */ + const int x = mc->mc_ki[mc->mc_top] - split_indx; + ksize = mc->mc_db->md_xsize; + split = page_leaf2key(mp, split_indx, ksize); + rsize = (nkeys - split_indx) * ksize; + lsize = (nkeys - split_indx) * sizeof(indx_t); + mdbx_cassert(mc, mp->mp_lower >= lsize); + mp->mp_lower -= (indx_t)lsize; + mdbx_cassert(mc, sister->mp_lower + lsize <= UINT16_MAX); + sister->mp_lower += (indx_t)lsize; + mdbx_cassert(mc, mp->mp_upper + rsize - lsize <= UINT16_MAX); + mp->mp_upper += (indx_t)(rsize - lsize); + mdbx_cassert(mc, sister->mp_upper >= rsize - lsize); + sister->mp_upper -= (indx_t)(rsize - lsize); + sepkey.iov_len = ksize; + sepkey.iov_base = (newindx != split_indx) ? 
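+      /* LEAF2 pages pack fixed-size keys back to back, so the
+       * relocation below is plain memcpy/memmove arithmetic over
+       * ksize-sized cells */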
split : newkey->iov_base; + if (x < 0) { + mdbx_cassert(mc, ksize >= sizeof(indx_t)); + ins = page_leaf2key(mp, mc->mc_ki[mc->mc_top], ksize); + memcpy(sister->mp_ptrs, split, rsize); + sepkey.iov_base = sister->mp_ptrs; + memmove(ins + ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize); + memcpy(ins, newkey->iov_base, ksize); + mdbx_cassert(mc, UINT16_MAX - mp->mp_lower >= (int)sizeof(indx_t)); + mp->mp_lower += sizeof(indx_t); + mdbx_cassert(mc, mp->mp_upper >= ksize - sizeof(indx_t)); + mp->mp_upper -= (indx_t)(ksize - sizeof(indx_t)); + } else { + memcpy(sister->mp_ptrs, split, x * ksize); + ins = page_leaf2key(sister, x, ksize); + memcpy(ins, newkey->iov_base, ksize); + memcpy(ins + ksize, split + x * ksize, rsize - x * ksize); + mdbx_cassert(mc, UINT16_MAX - sister->mp_lower >= (int)sizeof(indx_t)); + sister->mp_lower += sizeof(indx_t); + mdbx_cassert(mc, sister->mp_upper >= ksize - sizeof(indx_t)); + sister->mp_upper -= (indx_t)(ksize - sizeof(indx_t)); + mdbx_cassert(mc, x <= (int)UINT16_MAX); + mc->mc_ki[mc->mc_top] = (indx_t)x; + } + + if (mdbx_audit_enabled()) { + rc = mdbx_cursor_check(mc, C_UPDATING); + if (unlikely(rc != MDBX_SUCCESS)) + goto done; + rc = mdbx_cursor_check(&mn, C_UPDATING); + if (unlikely(rc != MDBX_SUCCESS)) + goto done; + } + } else { + /* Maximum free space in an empty page */ + const unsigned max_space = page_space(env); + const size_t new_size = IS_LEAF(mp) ? leaf_size(env, newkey, newdata) + : branch_size(env, newkey); + + /* grab a page to hold a temporary copy */ + tmp_ki_copy = mdbx_page_malloc(mc->mc_txn, 1); + if (unlikely(tmp_ki_copy == NULL)) { + rc = MDBX_ENOMEM; + goto done; + } + + /* prepare to insert */ + for (unsigned j = i = 0; i < nkeys; ++i, ++j) { + tmp_ki_copy->mp_ptrs[j] = 0; + j += (i == newindx); + tmp_ki_copy->mp_ptrs[j] = mp->mp_ptrs[i]; + } + tmp_ki_copy->mp_pgno = mp->mp_pgno; + tmp_ki_copy->mp_flags = mp->mp_flags; + tmp_ki_copy->mp_txnid = INVALID_TXNID; + tmp_ki_copy->mp_lower = 0; + tmp_ki_copy->mp_upper = (indx_t)max_space; + + /* When items are relatively large the split point needs + * to be checked, because being off-by-one will make the + * difference between success or failure in mdbx_node_add. + * + * It's also relevant if a page happens to be laid out + * such that one half of its nodes are all "small" and + * the other half of its nodes are "large". If the new + * item is also "large" and falls on the half with + * "large" nodes, it also may not fit. + * + * As a final tweak, if the new item goes on the last + * spot on the page (and thus, onto the new page), bias + * the split so the new page is emptier than the old page. + * This yields better packing during sequential inserts. */ + + if (nkeys < 32 || new_size > max_space / 16) { + /* Find split point */ + int dir; + if (newindx <= split_indx) { + i = 0; + dir = 1; + } else { + i = nkeys; + dir = -1; + } + size_t before = 0, after = new_size + page_used(env, mp); + int best = split_indx; + int best_offset = nkeys + 1; + + mdbx_trace("seek separator from %u, step %i, default %u, new-idx %u, " + "new-size %zu", + i, dir, split_indx, newindx, new_size); + do { + mdbx_cassert(mc, i <= nkeys); + size_t size = new_size; + if (i != newindx) { + MDBX_node *node = + (MDBX_node *)((char *)mp + tmp_ki_copy->mp_ptrs[i] + PAGEHDRSZ); + size = NODESIZE + node_ks(node) + sizeof(indx_t); + if (IS_LEAF(mp)) + size += F_ISSET(node_flags(node), F_BIGDATA) ? 
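+                  /* a big-data node keeps only the overflow pgno
+                   * inline; its payload lives on overflow pages */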
sizeof(pgno_t) + : node_ds(node); + size = EVEN(size); + } + + before += size; + after -= size; + mdbx_trace("step %u, size %zu, before %zu, after %zu, max %u", i, + size, before, after, max_space); + + if (before <= max_space && after <= max_space) { + int offset = branchless_abs(split_indx - i); + if (offset >= best_offset) + break; + best_offset = offset; + best = i; + } + i += dir; + } while (i < nkeys); + + split_indx = best + (dir > 0); + split_indx = (split_indx <= nkeys - minkeys + 1) ? split_indx + : nkeys - minkeys + 1; + split_indx = (split_indx >= minkeys) ? split_indx : minkeys; + mdbx_trace("chosen %u", split_indx); + } + + sepkey.iov_len = newkey->iov_len; + sepkey.iov_base = newkey->iov_base; + if (split_indx != newindx) { + MDBX_node *node = + (MDBX_node *)((char *)mp + tmp_ki_copy->mp_ptrs[split_indx] + + PAGEHDRSZ); + sepkey.iov_len = node_ks(node); + sepkey.iov_base = node_key(node); + } + } + } + mdbx_debug("separator is %d [%s]", split_indx, DKEY_DEBUG(&sepkey)); + + bool did_split_parent = false; + /* Copy separator key to the parent. */ + if (page_room(mn.mc_pg[ptop]) < branch_size(env, &sepkey)) { + mdbx_trace("need split parent branch-page for key %s", DKEY_DEBUG(&sepkey)); + mdbx_cassert(mc, page_numkeys(mn.mc_pg[ptop]) > 2); + mdbx_cassert(mc, !pure_left); + const int snum = mc->mc_snum; + const int depth = mc->mc_db->md_depth; + mn.mc_snum--; + mn.mc_top--; + did_split_parent = true; + /* We want other splits to find mn when doing fixups */ + WITH_CURSOR_TRACKING( + mn, rc = mdbx_page_split(&mn, &sepkey, NULL, sister->mp_pgno, 0)); + if (unlikely(rc != MDBX_SUCCESS)) + goto done; + mdbx_cassert(mc, (int)mc->mc_snum - snum == mc->mc_db->md_depth - depth); + if (mdbx_audit_enabled()) { + rc = mdbx_cursor_check(mc, C_UPDATING); + if (unlikely(rc != MDBX_SUCCESS)) + goto done; + } + + /* root split? */ + ptop += mc->mc_snum - snum; + + /* Right page might now have changed parent. + * Check if left page also changed parent. */ + if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && + mc->mc_ki[ptop] >= page_numkeys(mc->mc_pg[ptop])) { + for (i = 0; i < ptop; i++) { + mc->mc_pg[i] = mn.mc_pg[i]; + mc->mc_ki[i] = mn.mc_ki[i]; + } + mc->mc_pg[ptop] = mn.mc_pg[ptop]; + if (mn.mc_ki[ptop]) { + mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1; + } else { + /* find right page's left sibling */ + mc->mc_ki[ptop] = mn.mc_ki[ptop]; + rc = mdbx_cursor_sibling(mc, SIBLING_LEFT); + if (unlikely(rc != MDBX_SUCCESS)) { + if (rc == MDBX_NOTFOUND) /* improper mdbx_cursor_sibling() result */ { + mdbx_error("unexpected %i error going left sibling", rc); + rc = MDBX_PROBLEM; + } + goto done; + } + } + } + } else if (unlikely(pure_left)) { + MDBX_page *ptop_page = mc->mc_pg[ptop]; + mdbx_debug("adding to parent page %u node[%u] left-leaf page #%u key %s", + ptop_page->mp_pgno, mc->mc_ki[ptop], sister->mp_pgno, + DKEY(mc->mc_ki[ptop] ? newkey : NULL)); + mc->mc_top--; + rc = mdbx_node_add_branch(mc, mc->mc_ki[ptop], + mc->mc_ki[ptop] ? 
newkey : NULL, sister->mp_pgno); + mdbx_cassert(mc, mp == mc->mc_pg[ptop + 1] && + newindx == mc->mc_ki[ptop + 1] && ptop == mc->mc_top); + + if (likely(rc == MDBX_SUCCESS) && mc->mc_ki[ptop] == 0) { + mdbx_debug("update prev-first key on parent %s", DKEY(&sepkey)); + MDBX_node *node = page_node(mc->mc_pg[ptop], 1); + mdbx_cassert(mc, node_ks(node) == 0 && node_pgno(node) == mp->mp_pgno); + mdbx_cassert(mc, mc->mc_top == ptop && mc->mc_ki[ptop] == 0); + mc->mc_ki[ptop] = 1; + rc = mdbx_update_key(mc, &sepkey); + mdbx_cassert(mc, mc->mc_top == ptop && mc->mc_ki[ptop] == 1); + mdbx_cassert(mc, + mp == mc->mc_pg[ptop + 1] && newindx == mc->mc_ki[ptop + 1]); + mc->mc_ki[ptop] = 0; + } + + mc->mc_top++; + if (unlikely(rc != MDBX_SUCCESS)) + goto done; + + MDBX_node *node = page_node(mc->mc_pg[ptop], mc->mc_ki[ptop] + 1); + mdbx_cassert(mc, node_pgno(node) == mp->mp_pgno && + mc->mc_pg[ptop] == ptop_page); + } else { + mn.mc_top--; + mdbx_trace("add-to-parent the right-entry[%u] for new sibling-page", + mn.mc_ki[ptop]); + rc = mdbx_node_add_branch(&mn, mn.mc_ki[ptop], &sepkey, sister->mp_pgno); + mn.mc_top++; + if (unlikely(rc != MDBX_SUCCESS)) + goto done; + } + + if (unlikely(pure_left | pure_right)) { + mc->mc_pg[mc->mc_top] = sister; + mc->mc_ki[mc->mc_top] = 0; + switch (PAGETYPE(sister)) { + case P_LEAF: { + mdbx_cassert(mc, newpgno == 0 || newpgno == P_INVALID); + rc = mdbx_node_add_leaf(mc, 0, newkey, newdata, nflags); + } break; + case P_LEAF | P_LEAF2: { + mdbx_cassert(mc, (nflags & (F_BIGDATA | F_SUBDATA | F_DUPDATA)) == 0); + mdbx_cassert(mc, newpgno == 0 || newpgno == P_INVALID); + rc = mdbx_node_add_leaf2(mc, 0, newkey); + } break; + default: + rc = bad_page(sister, "wrong page-type %u\n", PAGETYPE(sister)); + } + if (unlikely(rc != MDBX_SUCCESS)) + goto done; + + if (pure_right) { + for (i = 0; i < mc->mc_top; i++) + mc->mc_ki[i] = mn.mc_ki[i]; + } else if (mc->mc_ki[mc->mc_top - 1] == 0) { + for (i = 2; i <= mc->mc_top; ++i) + if (mc->mc_ki[mc->mc_top - i]) { + get_key( + page_node(mc->mc_pg[mc->mc_top - i], mc->mc_ki[mc->mc_top - i]), + &sepkey); + if (mc->mc_dbx->md_cmp(newkey, &sepkey) < 0) { + mc->mc_top -= i; + mdbx_debug("update new-first on parent [%i] page %u key %s", + mc->mc_ki[mc->mc_top], mc->mc_pg[mc->mc_top]->mp_pgno, + DKEY(newkey)); + rc = mdbx_update_key(mc, newkey); + mc->mc_top += i; + if (unlikely(rc != MDBX_SUCCESS)) + goto done; + } + break; + } + } + } else if (!IS_LEAF2(mp)) { + /* Move nodes */ + mc->mc_pg[mc->mc_top] = sister; + i = split_indx; + unsigned n = 0; + pgno_t pgno = 0; + do { + mdbx_trace("i %u, nkeys %u => n %u, rp #%u", i, nkeys, n, + sister->mp_pgno); + MDBX_val *rdata = NULL; + if (i == newindx) { + rkey.iov_base = newkey->iov_base; + rkey.iov_len = newkey->iov_len; + if (IS_LEAF(mp)) + rdata = newdata; + else + pgno = newpgno; + flags = nflags; + /* Update index for the new key. */ + mc->mc_ki[mc->mc_top] = (indx_t)n; + } else { + MDBX_node *node = + (MDBX_node *)((char *)mp + tmp_ki_copy->mp_ptrs[i] + PAGEHDRSZ); + rkey.iov_base = node_key(node); + rkey.iov_len = node_ks(node); + if (IS_LEAF(mp)) { + xdata.iov_base = node_data(node); + xdata.iov_len = node_ds(node); + rdata = &xdata; + } else + pgno = node_pgno(node); + flags = node_flags(node); + } + + switch (PAGETYPE(sister)) { + case P_BRANCH: { + mdbx_cassert(mc, 0 == (uint16_t)flags); + /* First branch index doesn't need key data. */ + rc = mdbx_node_add_branch(mc, n, n ? 
&rkey : NULL, pgno); + } break; + case P_LEAF: { + mdbx_cassert(mc, pgno == 0); + mdbx_cassert(mc, rdata != NULL); + rc = mdbx_node_add_leaf(mc, n, &rkey, rdata, flags); + } break; + /* case P_LEAF | P_LEAF2: { + mdbx_cassert(mc, (nflags & (F_BIGDATA | F_SUBDATA | F_DUPDATA)) == 0); + mdbx_cassert(mc, gno == 0); + rc = mdbx_node_add_leaf2(mc, n, &rkey); + } break; */ + default: + rc = bad_page(sister, "wrong page-type %u\n", PAGETYPE(sister)); + } + if (unlikely(rc != MDBX_SUCCESS)) + goto done; + + ++n; + if (++i > nkeys) { + i = 0; + n = 0; + mc->mc_pg[mc->mc_top] = tmp_ki_copy; + mdbx_trace("switch to mp #%u", tmp_ki_copy->mp_pgno); + } + } while (i != split_indx); + + mdbx_trace("i %u, nkeys %u, n %u, pgno #%u", i, nkeys, n, + mc->mc_pg[mc->mc_top]->mp_pgno); + + nkeys = page_numkeys(tmp_ki_copy); + for (i = 0; i < nkeys; i++) + mp->mp_ptrs[i] = tmp_ki_copy->mp_ptrs[i]; + mp->mp_lower = tmp_ki_copy->mp_lower; + mp->mp_upper = tmp_ki_copy->mp_upper; + memcpy(page_node(mp, nkeys - 1), page_node(tmp_ki_copy, nkeys - 1), + env->me_psize - tmp_ki_copy->mp_upper - PAGEHDRSZ); + + /* reset back to original page */ + if (newindx < split_indx) { + mc->mc_pg[mc->mc_top] = mp; + } else { + mc->mc_pg[mc->mc_top] = sister; + mc->mc_ki[ptop]++; + /* Make sure mc_ki is still valid. */ + if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && + mc->mc_ki[ptop] >= page_numkeys(mc->mc_pg[ptop])) { + for (i = 0; i <= ptop; i++) { + mc->mc_pg[i] = mn.mc_pg[i]; + mc->mc_ki[i] = mn.mc_ki[i]; + } + } + } + } else if (newindx >= split_indx) { + mc->mc_pg[mc->mc_top] = sister; + mc->mc_ki[ptop]++; + /* Make sure mc_ki is still valid. */ + if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && + mc->mc_ki[ptop] >= page_numkeys(mc->mc_pg[ptop])) { + for (i = 0; i <= ptop; i++) { + mc->mc_pg[i] = mn.mc_pg[i]; + mc->mc_ki[i] = mn.mc_ki[i]; + } + } + } + + /* Adjust other cursors pointing to mp and/or to parent page */ + nkeys = page_numkeys(mp); + for (MDBX_cursor *m2 = mc->mc_txn->tw.cursors[mc->mc_dbi]; m2; + m2 = m2->mc_next) { + MDBX_cursor *m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; + if (m3 == mc) + continue; + if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) + continue; + if (foliage) { + /* sub cursors may be on different DB */ + if (m3->mc_pg[0] != mp) + continue; + /* root split */ + for (int k = foliage; k >= 0; k--) { + m3->mc_ki[k + 1] = m3->mc_ki[k]; + m3->mc_pg[k + 1] = m3->mc_pg[k]; + } + m3->mc_ki[0] = (m3->mc_ki[0] >= nkeys) ? 
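+          /* after a root split the old root page is child 0 of the
+           * new root and the sister page is child 1 */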
1 : 0; + m3->mc_pg[0] = mc->mc_pg[0]; + m3->mc_snum++; + m3->mc_top++; + } + + if (m3->mc_top >= mc->mc_top && m3->mc_pg[mc->mc_top] == mp && !pure_left) { + if (m3->mc_ki[mc->mc_top] >= newindx && !(nflags & MDBX_SPLIT_REPLACE)) + m3->mc_ki[mc->mc_top]++; + if (m3->mc_ki[mc->mc_top] >= nkeys) { + m3->mc_pg[mc->mc_top] = sister; + mdbx_cassert(mc, m3->mc_ki[mc->mc_top] >= nkeys); + m3->mc_ki[mc->mc_top] -= (indx_t)nkeys; + for (i = 0; i < mc->mc_top; i++) { + m3->mc_ki[i] = mn.mc_ki[i]; + m3->mc_pg[i] = mn.mc_pg[i]; + } + } + } else if (!did_split_parent && m3->mc_top >= ptop && + m3->mc_pg[ptop] == mc->mc_pg[ptop] && + m3->mc_ki[ptop] >= mc->mc_ki[ptop]) { + m3->mc_ki[ptop]++; /* also for the `pure-left` case */ + } + if (XCURSOR_INITED(m3) && IS_LEAF(mp)) + XCURSOR_REFRESH(m3, m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]); + } + mdbx_trace("mp #%u left: %d, sister #%u left: %d", mp->mp_pgno, page_room(mp), + sister->mp_pgno, page_room(sister)); + +done: + if (tmp_ki_copy) + mdbx_dpage_free(env, tmp_ki_copy, 1); + + if (unlikely(rc != MDBX_SUCCESS)) + mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; + else { + if (mdbx_audit_enabled()) + rc = mdbx_cursor_check(mc, C_UPDATING); + if (unlikely(nflags & MDBX_RESERVE)) { + MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if (!(node_flags(node) & F_BIGDATA)) + newdata->iov_base = node_data(node); + } +#if MDBX_ENABLE_PGOP_STAT + safe64_inc(&env->me_lck->mti_pgop_stat.split, 1); +#endif /* MDBX_ENABLE_PGOP_STAT */ + } + + mdbx_debug("<< mp #%u, rc %d", mp->mp_pgno, rc); + return rc; +} + +int mdbx_put(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data, + unsigned flags) { + int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(!key || !data)) + return MDBX_EINVAL; + + if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID))) + return MDBX_BAD_DBI; + + if (unlikely(flags & ~(MDBX_NOOVERWRITE | MDBX_NODUPDATA | MDBX_ALLDUPS | + MDBX_ALLDUPS | MDBX_RESERVE | MDBX_APPEND | + MDBX_APPENDDUP | MDBX_CURRENT | MDBX_MULTIPLE))) + return MDBX_EINVAL; + + if (unlikely(txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED))) + return (txn->mt_flags & MDBX_TXN_RDONLY) ? MDBX_EACCESS : MDBX_BAD_TXN; + + MDBX_cursor_couple cx; + rc = mdbx_cursor_init(&cx.outer, txn, dbi); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + cx.outer.mc_next = txn->tw.cursors[dbi]; + txn->tw.cursors[dbi] = &cx.outer; + + /* LY: support for update (explicit overwrite) */ + if (flags & MDBX_CURRENT) { + rc = mdbx_cursor_get(&cx.outer, (MDBX_val *)key, NULL, MDBX_SET); + if (likely(rc == MDBX_SUCCESS) && + (txn->mt_dbs[dbi].md_flags & MDBX_DUPSORT) && + (flags & MDBX_ALLDUPS) == 0) { + /* LY: allows update (explicit overwrite) only for unique keys */ + MDBX_node *node = page_node(cx.outer.mc_pg[cx.outer.mc_top], + cx.outer.mc_ki[cx.outer.mc_top]); + if (F_ISSET(node_flags(node), F_DUPDATA)) { + mdbx_tassert(txn, XCURSOR_INITED(&cx.outer) && + cx.outer.mc_xcursor->mx_db.md_entries > 1); + rc = MDBX_EMULTIVAL; + } + } + } + + if (likely(rc == MDBX_SUCCESS)) + rc = mdbx_cursor_put(&cx.outer, key, data, flags); + txn->tw.cursors[dbi] = cx.outer.mc_next; + + return rc; +} + +/**** COPYING *****************************************************************/ + +/* State needed for a double-buffering compacting copy. 
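+ *
+ * In brief (see mdbx_env_cthr_toggle() and mdbx_env_copythr() below):
+ * mc_head/mc_tail form a two-slot ring over mc_wbuf[]; the tree walker
+ * fills mc_wbuf[mc_head & 1] and bumps mc_head, the writer thread
+ * drains mc_wbuf[mc_tail & 1] and bumps mc_tail, and a zero mc_wlen
+ * hands over an EOF mark.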
*/ +typedef struct mdbx_copy { + MDBX_env *mc_env; + MDBX_txn *mc_txn; + mdbx_condpair_t mc_condpair; + uint8_t *mc_wbuf[2]; + uint8_t *mc_over[2]; + size_t mc_wlen[2]; + size_t mc_olen[2]; + mdbx_filehandle_t mc_fd; + /* Error code. Never cleared if set. Both threads can set nonzero + * to fail the copy. Not mutex-protected, MDBX expects atomic int. */ + volatile int mc_error; + pgno_t mc_next_pgno; + volatile unsigned mc_head; + volatile unsigned mc_tail; +} mdbx_copy; + +/* Dedicated writer thread for compacting copy. */ +static THREAD_RESULT __cold THREAD_CALL mdbx_env_copythr(void *arg) { + mdbx_copy *my = arg; + +#if defined(EPIPE) && !(defined(_WIN32) || defined(_WIN64)) + sigset_t sigset; + sigemptyset(&sigset); + sigaddset(&sigset, SIGPIPE); + my->mc_error = pthread_sigmask(SIG_BLOCK, &sigset, NULL); +#endif /* EPIPE */ + + mdbx_condpair_lock(&my->mc_condpair); + while (!my->mc_error) { + while (my->mc_tail == my->mc_head && !my->mc_error) { + int err = mdbx_condpair_wait(&my->mc_condpair, true); + if (err != MDBX_SUCCESS) { + my->mc_error = err; + goto bailout; + } + } + const unsigned toggle = my->mc_tail & 1; + size_t wsize = my->mc_wlen[toggle]; + if (wsize == 0) { + my->mc_tail += 1; + break /* EOF */; + } + my->mc_wlen[toggle] = 0; + uint8_t *ptr = my->mc_wbuf[toggle]; + again: + if (!my->mc_error) { + int err = mdbx_write(my->mc_fd, ptr, wsize); + if (err != MDBX_SUCCESS) { +#if defined(EPIPE) && !(defined(_WIN32) || defined(_WIN64)) + if (err == EPIPE) { + /* Collect the pending SIGPIPE, + * otherwise at least OS X gives it to the process on thread-exit. */ + int unused; + sigwait(&sigset, &unused); + } +#endif /* EPIPE */ + my->mc_error = err; + goto bailout; + } + } + + /* If there's an overflow page tail, write it too */ + wsize = my->mc_olen[toggle]; + if (wsize) { + my->mc_olen[toggle] = 0; + ptr = my->mc_over[toggle]; + goto again; + } + my->mc_tail += 1; + mdbx_condpair_signal(&my->mc_condpair, false); + } +bailout: + mdbx_condpair_unlock(&my->mc_condpair); + return (THREAD_RESULT)0; +} + +/* Give buffer and/or MDBX_EOF to writer thread, await unused buffer. */ +static __cold int mdbx_env_cthr_toggle(mdbx_copy *my) { + mdbx_condpair_lock(&my->mc_condpair); + mdbx_assert(my->mc_env, my->mc_head - my->mc_tail < 2 || my->mc_error); + my->mc_head += 1; + mdbx_condpair_signal(&my->mc_condpair, true); + while (!my->mc_error && + my->mc_head - my->mc_tail == 2 /* both buffers in use */) { + int err = mdbx_condpair_wait(&my->mc_condpair, false); + if (err != MDBX_SUCCESS) + my->mc_error = err; + } + mdbx_condpair_unlock(&my->mc_condpair); + return my->mc_error; +} + +/* Depth-first tree traversal for compacting copy. + * [in] my control structure. + * [in,out] pg database root. + * [in] flags includes F_DUPDATA if it is a sorted-duplicate sub-DB. 
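+ *
+ * Pages are renumbered sequentially from mc_next_pgno as they are
+ * streamed out, which is what makes the resulting copy compact.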
*/ +static __cold int mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { + MDBX_cursor_couple couple; + MDBX_page *mo, *mp, *leaf; + char *buf, *ptr; + int rc; + unsigned i; + + /* Empty DB, nothing to do */ + if (*pg == P_INVALID) + return MDBX_SUCCESS; + + memset(&couple, 0, sizeof(couple)); + couple.outer.mc_snum = 1; + couple.outer.mc_txn = my->mc_txn; + couple.outer.mc_flags = couple.inner.mx_cursor.mc_flags = + C_COPYING | C_SKIPORD; + + rc = mdbx_page_get(&couple.outer, *pg, &couple.outer.mc_pg[0], + my->mc_txn->mt_txnid); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + rc = mdbx_page_search_root(&couple.outer, NULL, MDBX_PS_FIRST); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + /* Make cursor pages writable */ + buf = ptr = mdbx_malloc(pgno2bytes(my->mc_env, couple.outer.mc_snum)); + if (buf == NULL) + return MDBX_ENOMEM; + + for (i = 0; i < couple.outer.mc_top; i++) { + mdbx_page_copy((MDBX_page *)ptr, couple.outer.mc_pg[i], + my->mc_env->me_psize); + couple.outer.mc_pg[i] = (MDBX_page *)ptr; + ptr += my->mc_env->me_psize; + } + + /* This is writable space for a leaf page. Usually not needed. */ + leaf = (MDBX_page *)ptr; + + while (couple.outer.mc_snum > 0) { + mp = couple.outer.mc_pg[couple.outer.mc_top]; + unsigned n = page_numkeys(mp); + + if (IS_LEAF(mp)) { + if (!IS_LEAF2(mp) && !(flags & F_DUPDATA)) { + for (i = 0; i < n; i++) { + MDBX_node *node = page_node(mp, i); + if (node_flags(node) & F_BIGDATA) { + MDBX_page *omp; + + /* Need writable leaf */ + if (mp != leaf) { + couple.outer.mc_pg[couple.outer.mc_top] = leaf; + mdbx_page_copy(leaf, mp, my->mc_env->me_psize); + mp = leaf; + node = page_node(mp, i); + } + + const pgno_t pgno = node_largedata_pgno(node); + poke_pgno(node_data(node), my->mc_next_pgno); + rc = mdbx_page_get(&couple.outer, pgno, &omp, + pp_txnid4chk(mp, my->mc_txn)); + if (unlikely(rc != MDBX_SUCCESS)) + goto done; + unsigned toggle = my->mc_head & 1; + if (my->mc_wlen[toggle] + my->mc_env->me_psize > + ((size_t)(MDBX_ENVCOPY_WRITEBUF))) { + rc = mdbx_env_cthr_toggle(my); + if (unlikely(rc != MDBX_SUCCESS)) + goto done; + toggle = my->mc_head & 1; + } + mo = (MDBX_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]); + memcpy(mo, omp, my->mc_env->me_psize); + mo->mp_pgno = my->mc_next_pgno; + my->mc_next_pgno += omp->mp_pages; + my->mc_wlen[toggle] += my->mc_env->me_psize; + if (omp->mp_pages > 1) { + my->mc_olen[toggle] = pgno2bytes(my->mc_env, omp->mp_pages - 1); + my->mc_over[toggle] = (uint8_t *)omp + my->mc_env->me_psize; + rc = mdbx_env_cthr_toggle(my); + if (unlikely(rc != MDBX_SUCCESS)) + goto done; + toggle = my->mc_head & 1; + } + } else if (node_flags(node) & F_SUBDATA) { + if (!MDBX_DISABLE_PAGECHECKS && + unlikely(node_ds(node) != sizeof(MDBX_db))) { + rc = MDBX_CORRUPTED; + goto done; + } + + /* Need writable leaf */ + if (mp != leaf) { + couple.outer.mc_pg[couple.outer.mc_top] = leaf; + mdbx_page_copy(leaf, mp, my->mc_env->me_psize); + mp = leaf; + node = page_node(mp, i); + } + + MDBX_db db; + memcpy(&db, node_data(node), sizeof(MDBX_db)); + rc = mdbx_env_cwalk(my, &db.md_root, node_flags(node) & F_DUPDATA); + if (rc) + goto done; + memcpy(node_data(node), &db, sizeof(MDBX_db)); + } + } + } + } else { + couple.outer.mc_ki[couple.outer.mc_top]++; + if (couple.outer.mc_ki[couple.outer.mc_top] < n) { + again: + rc = mdbx_page_get( + &couple.outer, + node_pgno(page_node(mp, couple.outer.mc_ki[couple.outer.mc_top])), + &mp, pp_txnid4chk(mp, my->mc_txn)); + if (unlikely(rc != MDBX_SUCCESS)) + goto done; + couple.outer.mc_top++; + 
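+        /* one level deeper on the cursor stack; branch pages get
+         * copied into the writable scratch pages so their child pgnos
+         * can be rewritten in place during the walk */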
couple.outer.mc_snum++;
+        couple.outer.mc_ki[couple.outer.mc_top] = 0;
+        if (IS_BRANCH(mp)) {
+          /* Whenever we advance to a sibling branch page,
+           * we must proceed all the way down to its first leaf. */
+          mdbx_page_copy(couple.outer.mc_pg[couple.outer.mc_top], mp,
+                         my->mc_env->me_psize);
+          goto again;
+        } else
+          couple.outer.mc_pg[couple.outer.mc_top] = mp;
+        continue;
+      }
+    }
+    unsigned toggle = my->mc_head & 1;
+    if (my->mc_wlen[toggle] + my->mc_env->me_psize >
+        ((size_t)(MDBX_ENVCOPY_WRITEBUF))) {
+      rc = mdbx_env_cthr_toggle(my);
+      if (unlikely(rc != MDBX_SUCCESS))
+        goto done;
+      toggle = my->mc_head & 1;
+    }
+    mo = (MDBX_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]);
+    mdbx_page_copy(mo, mp, my->mc_env->me_psize);
+    mo->mp_pgno = my->mc_next_pgno++;
+    my->mc_wlen[toggle] += my->mc_env->me_psize;
+    if (couple.outer.mc_top) {
+      /* Update parent if there is one */
+      node_set_pgno(page_node(couple.outer.mc_pg[couple.outer.mc_top - 1],
+                              couple.outer.mc_ki[couple.outer.mc_top - 1]),
+                    mo->mp_pgno);
+      mdbx_cursor_pop(&couple.outer);
+    } else {
+      /* Otherwise we're done */
+      *pg = mo->mp_pgno;
+      break;
+    }
+  }
+done:
+  mdbx_free(buf);
+  return rc;
+}
+
+static __cold void compact_fixup_meta(MDBX_env *env, MDBX_meta *meta) {
+  /* Calculate filesize taking into account shrink/growing thresholds */
+  if (meta->mm_geo.next != meta->mm_geo.now) {
+    meta->mm_geo.now = meta->mm_geo.next;
+    const pgno_t aligner = pv2pages(
+        meta->mm_geo.grow_pv ? meta->mm_geo.grow_pv : meta->mm_geo.shrink_pv);
+    if (aligner) {
+      const pgno_t aligned = pgno_align2os_pgno(
+          env, meta->mm_geo.next + aligner - meta->mm_geo.next % aligner);
+      meta->mm_geo.now = aligned;
+    }
+  }
+
+  if (meta->mm_geo.now < meta->mm_geo.lower)
+    meta->mm_geo.now = meta->mm_geo.lower;
+  if (meta->mm_geo.now > meta->mm_geo.upper)
+    meta->mm_geo.now = meta->mm_geo.upper;
+
+  /* Update signature */
+  assert(meta->mm_geo.now >= meta->mm_geo.next);
+  unaligned_poke_u64(4, meta->mm_datasync_sign, mdbx_meta_sign(meta));
+}
+
+/* Make resizeable */
+static __cold void make_sizeable(MDBX_meta *meta) {
+  meta->mm_geo.lower = MIN_PAGENO;
+  if (meta->mm_geo.grow_pv == 0) {
+    const pgno_t step = 1 + (meta->mm_geo.upper - meta->mm_geo.lower) / 42;
+    meta->mm_geo.grow_pv = pages2pv(step);
+  }
+  if (meta->mm_geo.shrink_pv == 0) {
+    const pgno_t step = pv2pages(meta->mm_geo.grow_pv) << 1;
+    meta->mm_geo.shrink_pv = pages2pv(step);
+  }
+}
+
+/* Copy environment with compaction. */
+static __cold int mdbx_env_compact(MDBX_env *env, MDBX_txn *read_txn,
+                                   mdbx_filehandle_t fd, uint8_t *buffer,
+                                   const bool dest_is_pipe, const int flags) {
+  const size_t meta_bytes = pgno2bytes(env, NUM_METAS);
+  uint8_t *const data_buffer =
+      buffer + ceil_powerof2(meta_bytes, env->me_os_psize);
+  MDBX_meta *const meta = mdbx_init_metas(env, buffer);
+  mdbx_meta_set_txnid(env, meta, read_txn->mt_txnid);
+
+  if (flags & MDBX_CP_FORCE_DYNAMIC_SIZE)
+    make_sizeable(meta);
+
+  /* copy canary sequences if present */
+  if (read_txn->mt_canary.v) {
+    meta->mm_canary = read_txn->mt_canary;
+    meta->mm_canary.v = mdbx_meta_txnid_stable(env, meta);
+  }
+
+  /* Set metapage 1 with current main DB */
+  pgno_t new_root, root = read_txn->mt_dbs[MAIN_DBI].md_root;
+  if ((new_root = root) == P_INVALID) {
+    /* When the DB is empty, handle it specially to
+     * fix any breakage like page leaks from ITS#8174.
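+     * (an empty tree needs no page walk at all: only the meta pages
+     * are written, so any leaked pages are simply never copied)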
*/ + meta->mm_dbs[MAIN_DBI].md_flags = read_txn->mt_dbs[MAIN_DBI].md_flags; + compact_fixup_meta(env, meta); + if (dest_is_pipe) { + int rc = mdbx_write(fd, buffer, meta_bytes); + if (rc != MDBX_SUCCESS) + return rc; + } + } else { + /* Count free pages + GC pages. Subtract from last_pg + * to find the new last_pg, which also becomes the new root. */ + pgno_t freecount = 0; + MDBX_cursor_couple couple; + MDBX_val key, data; + + int rc = mdbx_cursor_init(&couple.outer, read_txn, FREE_DBI); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + while ((rc = mdbx_cursor_get(&couple.outer, &key, &data, MDBX_NEXT)) == 0) + freecount += *(pgno_t *)data.iov_base; + if (unlikely(rc != MDBX_NOTFOUND)) + return rc; + + freecount += read_txn->mt_dbs[FREE_DBI].md_branch_pages + + read_txn->mt_dbs[FREE_DBI].md_leaf_pages + + read_txn->mt_dbs[FREE_DBI].md_overflow_pages; + + new_root = read_txn->mt_next_pgno - 1 - freecount; + meta->mm_geo.next = new_root + 1; + meta->mm_dbs[MAIN_DBI] = read_txn->mt_dbs[MAIN_DBI]; + meta->mm_dbs[MAIN_DBI].md_root = new_root; + + mdbx_copy ctx; + memset(&ctx, 0, sizeof(ctx)); + rc = mdbx_condpair_init(&ctx.mc_condpair); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + memset(data_buffer, 0, ((size_t)(MDBX_ENVCOPY_WRITEBUF)) * 2); + ctx.mc_wbuf[0] = data_buffer; + ctx.mc_wbuf[1] = data_buffer + ((size_t)(MDBX_ENVCOPY_WRITEBUF)); + ctx.mc_next_pgno = NUM_METAS; + ctx.mc_env = env; + ctx.mc_fd = fd; + ctx.mc_txn = read_txn; + + mdbx_thread_t thread; + int thread_err = mdbx_thread_create(&thread, mdbx_env_copythr, &ctx); + if (likely(thread_err == MDBX_SUCCESS)) { + if (dest_is_pipe) { + compact_fixup_meta(env, meta); + rc = mdbx_write(fd, buffer, meta_bytes); + } + if (rc == MDBX_SUCCESS) + rc = mdbx_env_cwalk(&ctx, &root, 0); + mdbx_env_cthr_toggle(&ctx); + mdbx_env_cthr_toggle(&ctx); + thread_err = mdbx_thread_join(thread); + mdbx_assert(env, (ctx.mc_tail == ctx.mc_head && + ctx.mc_wlen[ctx.mc_head & 1] == 0) || + ctx.mc_error); + mdbx_condpair_destroy(&ctx.mc_condpair); + } + if (unlikely(thread_err != MDBX_SUCCESS)) + return thread_err; + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + if (unlikely(ctx.mc_error != MDBX_SUCCESS)) + return ctx.mc_error; + + if (dest_is_pipe) { + if (unlikely(root != new_root)) { + mdbx_error("post-compactification root %" PRIaPGNO + " NE expected %" PRIaPGNO + " (source DB corrupted or has a page leak(s))", + root, new_root); + return MDBX_CORRUPTED; /* page leak or corrupt DB */ + } + } else { + if (unlikely(root > new_root)) { + mdbx_error("post-compactification root %" PRIaPGNO + " GT expected %" PRIaPGNO " (source DB corrupted)", + root, new_root); + return MDBX_CORRUPTED; /* page leak or corrupt DB */ + } + if (unlikely(root < new_root)) { + mdbx_warning("post-compactification root %" PRIaPGNO + " LT expected %" PRIaPGNO " (page leak(s) in source DB)", + root, new_root); + /* fixup meta */ + meta->mm_dbs[MAIN_DBI].md_root = root; + meta->mm_geo.next = root + 1; + } + compact_fixup_meta(env, meta); + } + } + + /* Extend file if required */ + if (meta->mm_geo.now != meta->mm_geo.next) { + const size_t whole_size = pgno2bytes(env, meta->mm_geo.now); + if (!dest_is_pipe) + return mdbx_ftruncate(fd, whole_size); + + const size_t used_size = pgno2bytes(env, meta->mm_geo.next); + memset(data_buffer, 0, ((size_t)(MDBX_ENVCOPY_WRITEBUF))); + for (size_t offset = used_size; offset < whole_size;) { + const size_t chunk = + (((size_t)(MDBX_ENVCOPY_WRITEBUF)) < whole_size - offset) + ? 
((size_t)(MDBX_ENVCOPY_WRITEBUF)) + : whole_size - offset; + /* copy to avoid EFAULT in case swapped-out */ + int rc = mdbx_write(fd, data_buffer, chunk); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + offset += chunk; + } + } + return MDBX_SUCCESS; +} + +/* Copy environment as-is. */ +static __cold int mdbx_env_copy_asis(MDBX_env *env, MDBX_txn *read_txn, + mdbx_filehandle_t fd, uint8_t *buffer, + const bool dest_is_pipe, const int flags) { + /* We must start the actual read txn after blocking writers */ + int rc = mdbx_txn_end(read_txn, MDBX_END_RESET_TMP); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + /* Temporarily block writers until we snapshot the meta pages */ + rc = mdbx_txn_lock(env, false); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + rc = mdbx_txn_renew0(read_txn, MDBX_TXN_RDONLY); + if (unlikely(rc != MDBX_SUCCESS)) { + mdbx_txn_unlock(env); + return rc; + } + + mdbx_jitter4testing(false); + const size_t meta_bytes = pgno2bytes(env, NUM_METAS); + /* Make a snapshot of meta-pages, + * but writing ones after the data was flushed */ + memcpy(buffer, env->me_map, meta_bytes); + MDBX_meta *const headcopy = /* LY: get pointer to the snapshot copy */ + (MDBX_meta *)(buffer + ((uint8_t *)mdbx_meta_head(env) - env->me_map)); + mdbx_txn_unlock(env); + + if (flags & MDBX_CP_FORCE_DYNAMIC_SIZE) + make_sizeable(headcopy); + /* Update signature to steady */ + unaligned_poke_u64(4, headcopy->mm_datasync_sign, mdbx_meta_sign(headcopy)); + + /* Copy the data */ + const size_t whole_size = pgno_align2os_bytes(env, read_txn->mt_end_pgno); + const size_t used_size = pgno2bytes(env, read_txn->mt_next_pgno); + mdbx_jitter4testing(false); + + if (dest_is_pipe) + rc = mdbx_write(fd, buffer, meta_bytes); + + uint8_t *const data_buffer = + buffer + ceil_powerof2(meta_bytes, env->me_os_psize); + for (size_t offset = meta_bytes; rc == MDBX_SUCCESS && offset < used_size;) { +#if MDBX_USE_SENDFILE + static bool sendfile_unavailable; + if (dest_is_pipe && likely(!sendfile_unavailable)) { + off_t in_offset = offset; + const ssize_t written = + sendfile(fd, env->me_lazy_fd, &in_offset, used_size - offset); + if (likely(written > 0)) { + offset = in_offset; + continue; + } + rc = MDBX_ENODATA; + if (written == 0 || ignore_enosys(rc = errno) != MDBX_RESULT_TRUE) + break; + sendfile_unavailable = true; + } +#endif /* MDBX_USE_SENDFILE */ + +#if MDBX_USE_COPYFILERANGE + static bool copyfilerange_unavailable; + if (!dest_is_pipe && likely(!copyfilerange_unavailable)) { + off_t in_offset = offset, out_offset = offset; + ssize_t bytes_copied = copy_file_range( + env->me_lazy_fd, &in_offset, fd, &out_offset, used_size - offset, 0); + if (likely(bytes_copied > 0)) { + offset = in_offset; + continue; + } + rc = MDBX_ENODATA; + if (bytes_copied == 0 || ignore_enosys(rc = errno) != MDBX_RESULT_TRUE) + break; + copyfilerange_unavailable = true; + } +#endif /* MDBX_USE_COPYFILERANGE */ + + /* fallback to portable */ + const size_t chunk = + (((size_t)(MDBX_ENVCOPY_WRITEBUF)) < used_size - offset) + ? 
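+            /* i.e. chunk = min(MDBX_ENVCOPY_WRITEBUF, remainder); this
+             * portable path runs only when sendfile() and
+             * copy_file_range() were skipped or have failed */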
((size_t)(MDBX_ENVCOPY_WRITEBUF))
+            : used_size - offset;
+    /* copy to avoid EFAULT in case swapped-out */
+    memcpy(data_buffer, env->me_map + offset, chunk);
+    rc = mdbx_write(fd, data_buffer, chunk);
+    offset += chunk;
+  }
+
+  /* Extend file if required */
+  if (likely(rc == MDBX_SUCCESS) && whole_size != used_size) {
+    if (!dest_is_pipe)
+      rc = mdbx_ftruncate(fd, whole_size);
+    else {
+      memset(data_buffer, 0, ((size_t)(MDBX_ENVCOPY_WRITEBUF)));
+      for (size_t offset = used_size;
+           rc == MDBX_SUCCESS && offset < whole_size;) {
+        const size_t chunk =
+            (((size_t)(MDBX_ENVCOPY_WRITEBUF)) < whole_size - offset)
+                ? ((size_t)(MDBX_ENVCOPY_WRITEBUF))
+                : whole_size - offset;
+        /* copy to avoid EFAULT in case swapped-out */
+        rc = mdbx_write(fd, data_buffer, chunk);
+        offset += chunk;
+      }
+    }
+  }
+
+  return rc;
+}
+
+__cold int mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd,
+                            unsigned flags) {
+  int rc = check_env(env, true);
+  if (unlikely(rc != MDBX_SUCCESS))
+    return rc;
+
+  const int dest_is_pipe = mdbx_is_pipe(fd);
+  if (MDBX_IS_ERROR(dest_is_pipe))
+    return dest_is_pipe;
+
+  if (!dest_is_pipe) {
+    rc = mdbx_fseek(fd, 0);
+    if (unlikely(rc != MDBX_SUCCESS))
+      return rc;
+  }
+
+  const size_t buffer_size =
+      pgno_align2os_bytes(env, NUM_METAS) +
+      ceil_powerof2(((flags & MDBX_CP_COMPACT)
+                         ? ((size_t)(MDBX_ENVCOPY_WRITEBUF)) * 2
+                         : ((size_t)(MDBX_ENVCOPY_WRITEBUF))),
+                    env->me_os_psize);
+
+  uint8_t *buffer = NULL;
+  rc = mdbx_memalign_alloc(env->me_os_psize, buffer_size, (void **)&buffer);
+  if (unlikely(rc != MDBX_SUCCESS))
+    return rc;
+
+  MDBX_txn *read_txn = NULL;
+  /* Do the lock/unlock of the reader mutex before starting the
+   * write txn. Otherwise other read txns could block writers. */
+  rc = mdbx_txn_begin(env, NULL, MDBX_TXN_RDONLY, &read_txn);
+  if (unlikely(rc != MDBX_SUCCESS)) {
+    mdbx_memalign_free(buffer);
+    return rc;
+  }
+
+  if (!dest_is_pipe) {
+    /* First write a stub into the meta-pages, so that an incomplete
+     * copy can never be mistaken for a valid database. */
+    memset(buffer, -1, pgno2bytes(env, NUM_METAS));
+    rc = mdbx_write(fd, buffer, pgno2bytes(env, NUM_METAS));
+  }
+
+  if (likely(rc == MDBX_SUCCESS)) {
+    memset(buffer, 0, pgno2bytes(env, NUM_METAS));
+    rc = ((flags & MDBX_CP_COMPACT) ? mdbx_env_compact : mdbx_env_copy_asis)(
+        env, read_txn, fd, buffer, dest_is_pipe, flags);
+  }
+  mdbx_txn_abort(read_txn);
+
+  if (!dest_is_pipe) {
+    if (likely(rc == MDBX_SUCCESS))
+      rc = mdbx_fsync(fd, MDBX_SYNC_DATA | MDBX_SYNC_SIZE);
+
+    /* Write actual meta */
+    if (likely(rc == MDBX_SUCCESS))
+      rc = mdbx_pwrite(fd, buffer, pgno2bytes(env, NUM_METAS), 0);
+
+    if (likely(rc == MDBX_SUCCESS))
+      rc = mdbx_fsync(fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
+  }
+
+  mdbx_memalign_free(buffer);
+  return rc;
+}
+
+__cold int mdbx_env_copy(MDBX_env *env, const char *dest_path,
+                         MDBX_copy_flags_t flags) {
+  int rc = check_env(env, true);
+  if (unlikely(rc != MDBX_SUCCESS))
+    return rc;
+
+  if (unlikely(!dest_path))
+    return MDBX_EINVAL;
+
+  /* The destination path must exist, but the destination file must not.
+   * We don't want the OS to cache the writes, since the source data is
+   * already in the OS cache.
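+   * Note: the destination is opened below via MDBX_OPEN_COPY and an
+   * exclusive lock is taken (LockFileEx on Windows, fcntl plus flock on
+   * POSIX) before any data is written, so other processes cannot lock
+   * and use the half-written copy.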
*/ + mdbx_filehandle_t newfd; + rc = mdbx_openfile(MDBX_OPEN_COPY, env, dest_path, &newfd, +#if defined(_WIN32) || defined(_WIN64) + (mdbx_mode_t)-1 +#else + S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP +#endif + ); + + if (rc == MDBX_SUCCESS) { +#if defined(_WIN32) || defined(_WIN64) + OVERLAPPED ov; + memset(&ov, 0, sizeof(ov)); + if (!LockFileEx(newfd, LOCKFILE_EXCLUSIVE_LOCK | LOCKFILE_FAIL_IMMEDIATELY, + 0, 0, INT32_MAX, &ov)) + rc = GetLastError(); +#else + struct flock lock_op; + memset(&lock_op, 0, sizeof(lock_op)); + lock_op.l_type = F_WRLCK; + lock_op.l_whence = SEEK_SET; + lock_op.l_start = 0; + lock_op.l_len = + (sizeof(lock_op.l_len) > 4 ? INT64_MAX : INT32_MAX) & ~(size_t)0xffff; + if (fcntl(newfd, F_SETLK, &lock_op) +#if (defined(__linux__) || defined(__gnu_linux__)) && defined(LOCK_EX) && \ + (!defined(__ANDROID_API__) || __ANDROID_API__ >= 24) + || flock(newfd, LOCK_EX | LOCK_NB) +#endif /* Linux */ + ) + rc = errno; +#endif /* Windows / POSIX */ + } + + if (rc == MDBX_SUCCESS) + rc = mdbx_env_copy2fd(env, newfd, flags); + + if (newfd != INVALID_HANDLE_VALUE) { + int err = mdbx_closefile(newfd); + if (rc == MDBX_SUCCESS && err != rc) + rc = err; + if (rc != MDBX_SUCCESS) + (void)mdbx_removefile(dest_path); + } + + return rc; +} + +/******************************************************************************/ + +__cold int mdbx_env_set_flags(MDBX_env *env, MDBX_env_flags_t flags, + bool onoff) { + int rc = check_env(env, false); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(flags & + ((env->me_flags & MDBX_ENV_ACTIVE) ? ~ENV_CHANGEABLE_FLAGS + : ~ENV_USABLE_FLAGS))) + return MDBX_EPERM; + + if (unlikely(env->me_flags & MDBX_RDONLY)) + return MDBX_EACCESS; + + if ((env->me_flags & MDBX_ENV_ACTIVE) && + unlikely(env->me_txn0->mt_owner == mdbx_thread_self())) + return MDBX_BUSY; + + const bool lock_needed = (env->me_flags & MDBX_ENV_ACTIVE) && + env->me_txn0->mt_owner != mdbx_thread_self(); + bool should_unlock = false; + if (lock_needed) { + rc = mdbx_txn_lock(env, false); + if (unlikely(rc)) + return rc; + should_unlock = true; + } + + if (onoff) + env->me_flags = merge_sync_flags(env->me_flags, flags); + else + env->me_flags &= ~flags; + + if (should_unlock) + mdbx_txn_unlock(env); + return MDBX_SUCCESS; +} + +__cold int mdbx_env_get_flags(const MDBX_env *env, unsigned *arg) { + int rc = check_env(env, false); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(!arg)) + return MDBX_EINVAL; + + *arg = env->me_flags & ENV_USABLE_FLAGS; + return MDBX_SUCCESS; +} + +__cold int mdbx_env_set_userctx(MDBX_env *env, void *ctx) { + int rc = check_env(env, false); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + env->me_userctx = ctx; + return MDBX_SUCCESS; +} + +void *__cold mdbx_env_get_userctx(const MDBX_env *env) { + return env ? 
env->me_userctx : NULL; +} + +__cold int mdbx_env_set_assert(MDBX_env *env, MDBX_assert_func *func) { + int rc = check_env(env, false); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + +#if MDBX_DEBUG + env->me_assert_func = func; + return MDBX_SUCCESS; +#else + (void)func; + return MDBX_ENOSYS; +#endif +} + +__cold int mdbx_env_get_path(const MDBX_env *env, const char **arg) { + int rc = check_env(env, true); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(!arg)) + return MDBX_EINVAL; + + *arg = env->me_pathname; + return MDBX_SUCCESS; +} + +__cold int mdbx_env_get_fd(const MDBX_env *env, mdbx_filehandle_t *arg) { + int rc = check_env(env, true); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(!arg)) + return MDBX_EINVAL; + + *arg = env->me_lazy_fd; + return MDBX_SUCCESS; +} + +#ifndef LIBMDBX_NO_EXPORTS_LEGACY_API +__cold int mdbx_env_stat(const MDBX_env *env, MDBX_stat *stat, size_t bytes) { + return __inline_mdbx_env_stat(env, stat, bytes); +} +#endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */ + +static void stat_get(const MDBX_db *db, MDBX_stat *st, size_t bytes) { + st->ms_depth = db->md_depth; + st->ms_branch_pages = db->md_branch_pages; + st->ms_leaf_pages = db->md_leaf_pages; + st->ms_overflow_pages = db->md_overflow_pages; + st->ms_entries = db->md_entries; + if (likely(bytes >= + offsetof(MDBX_stat, ms_mod_txnid) + sizeof(st->ms_mod_txnid))) + st->ms_mod_txnid = db->md_mod_txnid; +} + +static void stat_add(const MDBX_db *db, MDBX_stat *const st, + const size_t bytes) { + st->ms_depth += db->md_depth; + st->ms_branch_pages += db->md_branch_pages; + st->ms_leaf_pages += db->md_leaf_pages; + st->ms_overflow_pages += db->md_overflow_pages; + st->ms_entries += db->md_entries; + if (likely(bytes >= + offsetof(MDBX_stat, ms_mod_txnid) + sizeof(st->ms_mod_txnid))) + st->ms_mod_txnid = (st->ms_mod_txnid > db->md_mod_txnid) ? 
st->ms_mod_txnid + : db->md_mod_txnid; +} + +static __cold int stat_acc(const MDBX_txn *txn, MDBX_stat *st, size_t bytes) { + int err = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(err != MDBX_SUCCESS)) + return err; + + st->ms_psize = txn->mt_env->me_psize; +#if 1 + /* assuming GC is internal and not subject for accounting */ + stat_get(&txn->mt_dbs[MAIN_DBI], st, bytes); +#else + stat_get(&txn->mt_dbs[FREE_DBI], st, bytes); + stat_add(&txn->mt_dbs[MAIN_DBI], st, bytes); +#endif + + /* account opened named subDBs */ + for (MDBX_dbi dbi = CORE_DBS; dbi < txn->mt_numdbs; dbi++) + if ((txn->mt_dbistate[dbi] & (DBI_VALID | DBI_STALE)) == DBI_VALID) + stat_add(txn->mt_dbs + dbi, st, bytes); + + if (!(txn->mt_dbs[MAIN_DBI].md_flags & (MDBX_DUPSORT | MDBX_INTEGERKEY)) && + txn->mt_dbs[MAIN_DBI].md_entries /* TODO: use `md_subs` field */) { + MDBX_cursor_couple cx; + err = mdbx_cursor_init(&cx.outer, (MDBX_txn *)txn, MAIN_DBI); + if (unlikely(err != MDBX_SUCCESS)) + return err; + + /* scan and account not opened named subDBs */ + err = mdbx_page_search(&cx.outer, NULL, MDBX_PS_FIRST); + while (err == MDBX_SUCCESS) { + const MDBX_page *mp = cx.outer.mc_pg[cx.outer.mc_top]; + for (unsigned i = 0; i < page_numkeys(mp); i++) { + const MDBX_node *node = page_node(mp, i); + if (node_flags(node) != F_SUBDATA) + continue; + if (unlikely(node_ds(node) != sizeof(MDBX_db))) + return MDBX_CORRUPTED; + + /* skip opened and already accounted */ + for (MDBX_dbi dbi = CORE_DBS; dbi < txn->mt_numdbs; dbi++) + if ((txn->mt_dbistate[dbi] & (DBI_VALID | DBI_STALE)) == DBI_VALID && + node_ks(node) == txn->mt_dbxs[dbi].md_name.iov_len && + memcmp(node_key(node), txn->mt_dbxs[dbi].md_name.iov_base, + node_ks(node)) == 0) { + node = NULL; + break; + } + + if (node) { + MDBX_db db; + memcpy(&db, node_data(node), sizeof(db)); + stat_add(&db, st, bytes); + } + } + err = mdbx_cursor_sibling(&cx.outer, SIBLING_RIGHT); + } + if (unlikely(err != MDBX_NOTFOUND)) + return err; + } + + return MDBX_SUCCESS; +} + +__cold int mdbx_env_stat_ex(const MDBX_env *env, const MDBX_txn *txn, + MDBX_stat *dest, size_t bytes) { + if (unlikely(!dest)) + return MDBX_EINVAL; + const size_t size_before_modtxnid = offsetof(MDBX_stat, ms_mod_txnid); + if (unlikely(bytes != sizeof(MDBX_stat)) && bytes != size_before_modtxnid) + return MDBX_EINVAL; + + if (likely(txn)) { + if (env && unlikely(txn->mt_env != env)) + return MDBX_EINVAL; + return stat_acc(txn, dest, bytes); + } + + int err = check_env(env, true); + if (unlikely(err != MDBX_SUCCESS)) + return err; + + if (env->me_txn0 && env->me_txn0->mt_owner == mdbx_thread_self()) + /* inside write-txn */ + return stat_acc(env->me_txn, dest, bytes); + + MDBX_txn *tmp_txn; + err = mdbx_txn_begin((MDBX_env *)env, NULL, MDBX_TXN_RDONLY, &tmp_txn); + if (unlikely(err != MDBX_SUCCESS)) + return err; + + const int rc = stat_acc(tmp_txn, dest, bytes); + err = mdbx_txn_abort(tmp_txn); + if (unlikely(err != MDBX_SUCCESS)) + return err; + return rc; +} + +__cold int mdbx_dbi_dupsort_depthmask(MDBX_txn *txn, MDBX_dbi dbi, + uint32_t *mask) { + int rc = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(!mask)) + return MDBX_EINVAL; + + if (unlikely(!check_dbi(txn, dbi, DBI_VALID))) + return MDBX_BAD_DBI; + + MDBX_cursor_couple cx; + rc = mdbx_cursor_init(&cx.outer, txn, dbi); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + if ((cx.outer.mc_db->md_flags & MDBX_DUPSORT) == 0) + return MDBX_RESULT_TRUE; + + MDBX_val key, data; + rc = 
mdbx_cursor_first(&cx.outer, &key, &data); + *mask = 0; + while (rc == MDBX_SUCCESS) { + const MDBX_node *node = page_node(cx.outer.mc_pg[cx.outer.mc_top], + cx.outer.mc_ki[cx.outer.mc_top]); + const MDBX_db *db = node_data(node); + const unsigned flags = node_flags(node); + switch (flags) { + case F_BIGDATA: + case 0: + /* single-value entry, deep = 0 */ + *mask |= 1 << 0; + break; + case F_DUPDATA: + /* single sub-page, deep = 1 */ + *mask |= 1 << 1; + break; + case F_DUPDATA | F_SUBDATA: + /* sub-tree */ + *mask |= 1 << unaligned_peek_u16(1, &db->md_depth); + break; + default: + mdbx_error("wrong node-flags %u", flags); + return MDBX_CORRUPTED; + } + rc = mdbx_cursor_next(&cx.outer, &key, &data, MDBX_NEXT_NODUP); + } + + return (rc == MDBX_NOTFOUND) ? MDBX_SUCCESS : rc; +} + +#ifndef LIBMDBX_NO_EXPORTS_LEGACY_API +__cold int mdbx_env_info(const MDBX_env *env, MDBX_envinfo *info, + size_t bytes) { + return __inline_mdbx_env_info(env, info, bytes); +} +#endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */ + +__cold int mdbx_env_info_ex(const MDBX_env *env, const MDBX_txn *txn, + MDBX_envinfo *arg, size_t bytes) { + if (unlikely((env == NULL && txn == NULL) || arg == NULL)) + return MDBX_EINVAL; + + if (txn) { + int err = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(err != MDBX_SUCCESS)) + return err; + } + if (env) { + int err = check_env(env, false); + if (unlikely(err != MDBX_SUCCESS)) + return err; + if (txn && unlikely(txn->mt_env != env)) + return MDBX_EINVAL; + } else { + env = txn->mt_env; + } + + const size_t size_before_bootid = offsetof(MDBX_envinfo, mi_bootid); + const size_t size_before_pgop_stat = offsetof(MDBX_envinfo, mi_pgop_stat); + if (unlikely(bytes != sizeof(MDBX_envinfo)) && bytes != size_before_bootid && + bytes != size_before_pgop_stat) + return MDBX_EINVAL; + + /* is the environment open? 
(https://github.com/erthink/libmdbx/issues/171) */ + if (unlikely(!env->me_map)) { + /* environment not yet opened */ +#if 1 + /* default behavior: returns the available info but zeroed the rest */ + memset(arg, 0, bytes); + arg->mi_geo.lower = env->me_dbgeo.lower; + arg->mi_geo.upper = env->me_dbgeo.upper; + arg->mi_geo.shrink = env->me_dbgeo.shrink; + arg->mi_geo.grow = env->me_dbgeo.grow; + arg->mi_geo.current = env->me_dbgeo.now; + arg->mi_maxreaders = env->me_maxreaders; + arg->mi_dxb_pagesize = env->me_psize; + arg->mi_sys_pagesize = env->me_os_psize; + if (likely(bytes > size_before_bootid)) { + arg->mi_bootid.current.x = bootid.x; + arg->mi_bootid.current.y = bootid.y; + } + return MDBX_SUCCESS; +#else + /* some users may prefer this behavior: return appropriate error */ + return MDBX_EPERM; +#endif + } + + const MDBX_meta *const meta0 = METAPAGE(env, 0); + const MDBX_meta *const meta1 = METAPAGE(env, 1); + const MDBX_meta *const meta2 = METAPAGE(env, 2); + pgno_t unsynced_pages; + while (1) { + if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) + return MDBX_PANIC; + + const MDBX_meta *const recent_meta = mdbx_meta_head(env); + arg->mi_recent_txnid = mdbx_meta_txnid_fluid(env, recent_meta); + arg->mi_meta0_txnid = mdbx_meta_txnid_fluid(env, meta0); + arg->mi_meta0_sign = unaligned_peek_u64(4, meta0->mm_datasync_sign); + arg->mi_meta1_txnid = mdbx_meta_txnid_fluid(env, meta1); + arg->mi_meta1_sign = unaligned_peek_u64(4, meta1->mm_datasync_sign); + arg->mi_meta2_txnid = mdbx_meta_txnid_fluid(env, meta2); + arg->mi_meta2_sign = unaligned_peek_u64(4, meta2->mm_datasync_sign); + if (likely(bytes > size_before_bootid)) { + memcpy(&arg->mi_bootid.meta0, &meta0->mm_bootid, 16); + memcpy(&arg->mi_bootid.meta1, &meta1->mm_bootid, 16); + memcpy(&arg->mi_bootid.meta2, &meta2->mm_bootid, 16); + } + + const MDBX_meta *txn_meta = recent_meta; + arg->mi_last_pgno = txn_meta->mm_geo.next - 1; + arg->mi_geo.current = pgno2bytes(env, txn_meta->mm_geo.now); + if (txn) { + arg->mi_last_pgno = txn->mt_next_pgno - 1; + arg->mi_geo.current = pgno2bytes(env, txn->mt_end_pgno); + + const txnid_t wanna_meta_txnid = (txn->mt_flags & MDBX_TXN_RDONLY) + ? txn->mt_txnid + : txn->mt_txnid - xMDBX_TXNID_STEP; + txn_meta = (arg->mi_meta0_txnid == wanna_meta_txnid) ? meta0 : txn_meta; + txn_meta = (arg->mi_meta1_txnid == wanna_meta_txnid) ? meta1 : txn_meta; + txn_meta = (arg->mi_meta2_txnid == wanna_meta_txnid) ? 
meta2 : txn_meta; + } + arg->mi_geo.lower = pgno2bytes(env, txn_meta->mm_geo.lower); + arg->mi_geo.upper = pgno2bytes(env, txn_meta->mm_geo.upper); + arg->mi_geo.shrink = pgno2bytes(env, pv2pages(txn_meta->mm_geo.shrink_pv)); + arg->mi_geo.grow = pgno2bytes(env, pv2pages(txn_meta->mm_geo.grow_pv)); + unsynced_pages = + atomic_load32(&env->me_lck->mti_unsynced_pages, mo_Relaxed) + + (atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed) != + (uint32_t)arg->mi_last_pgno); + + arg->mi_mapsize = env->me_dxb_mmap.limit; + mdbx_compiler_barrier(); + if (likely(arg->mi_meta0_txnid == mdbx_meta_txnid_fluid(env, meta0) && + arg->mi_meta0_sign == + unaligned_peek_u64(4, meta0->mm_datasync_sign) && + arg->mi_meta1_txnid == mdbx_meta_txnid_fluid(env, meta1) && + arg->mi_meta1_sign == + unaligned_peek_u64(4, meta1->mm_datasync_sign) && + arg->mi_meta2_txnid == mdbx_meta_txnid_fluid(env, meta2) && + arg->mi_meta2_sign == + unaligned_peek_u64(4, meta2->mm_datasync_sign) && + recent_meta == mdbx_meta_head(env) && + arg->mi_recent_txnid == mdbx_meta_txnid_fluid(env, recent_meta))) + break; + } + + const MDBX_lockinfo *const lck = env->me_lck; + arg->mi_maxreaders = env->me_maxreaders; + arg->mi_numreaders = env->me_lck_mmap.lck + ? atomic_load32(&lck->mti_numreaders, mo_Relaxed) + : INT32_MAX; + arg->mi_dxb_pagesize = env->me_psize; + arg->mi_sys_pagesize = env->me_os_psize; + + if (likely(bytes > size_before_bootid)) { + arg->mi_unsync_volume = pgno2bytes(env, unsynced_pages); + const uint64_t monotime_now = mdbx_osal_monotime(); + uint64_t ts = atomic_load64(&lck->mti_sync_timestamp, mo_Relaxed); + arg->mi_since_sync_seconds16dot16 = + ts ? mdbx_osal_monotime_to_16dot16(monotime_now - ts) : 0; + ts = atomic_load64(&lck->mti_reader_check_timestamp, mo_Relaxed); + arg->mi_since_reader_check_seconds16dot16 = + ts ? mdbx_osal_monotime_to_16dot16(monotime_now - ts) : 0; + arg->mi_autosync_threshold = pgno2bytes( + env, atomic_load32(&lck->mti_autosync_threshold, mo_Relaxed)); + arg->mi_autosync_period_seconds16dot16 = mdbx_osal_monotime_to_16dot16( + atomic_load64(&lck->mti_autosync_period, mo_Relaxed)); + arg->mi_bootid.current.x = bootid.x; + arg->mi_bootid.current.y = bootid.y; + arg->mi_mode = env->me_lck_mmap.lck ? 
lck->mti_envmode.weak : env->me_flags;
+  }
+
+  if (likely(bytes > size_before_pgop_stat)) {
+#if MDBX_ENABLE_PGOP_STAT
+    arg->mi_pgop_stat.newly =
+        atomic_load64(&lck->mti_pgop_stat.newly, mo_Relaxed);
+    arg->mi_pgop_stat.cow = atomic_load64(&lck->mti_pgop_stat.cow, mo_Relaxed);
+    arg->mi_pgop_stat.clone =
+        atomic_load64(&lck->mti_pgop_stat.clone, mo_Relaxed);
+    arg->mi_pgop_stat.split =
+        atomic_load64(&lck->mti_pgop_stat.split, mo_Relaxed);
+    arg->mi_pgop_stat.merge =
+        atomic_load64(&lck->mti_pgop_stat.merge, mo_Relaxed);
+    arg->mi_pgop_stat.spill =
+        atomic_load64(&lck->mti_pgop_stat.spill, mo_Relaxed);
+    arg->mi_pgop_stat.unspill =
+        atomic_load64(&lck->mti_pgop_stat.unspill, mo_Relaxed);
+    arg->mi_pgop_stat.wops =
+        atomic_load64(&lck->mti_pgop_stat.wops, mo_Relaxed);
+#else
+    memset(&arg->mi_pgop_stat, 0, sizeof(arg->mi_pgop_stat));
+#endif /* MDBX_ENABLE_PGOP_STAT */
+  }
+
+  arg->mi_self_latter_reader_txnid = arg->mi_latter_reader_txnid = 0;
+  if (lck) {
+    arg->mi_self_latter_reader_txnid = arg->mi_latter_reader_txnid =
+        arg->mi_recent_txnid;
+    for (unsigned i = 0; i < arg->mi_numreaders; ++i) {
+      const uint32_t pid =
+          atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease);
+      if (pid) {
+        const txnid_t txnid = safe64_read(&lck->mti_readers[i].mr_txnid);
+        if (arg->mi_latter_reader_txnid > txnid)
+          arg->mi_latter_reader_txnid = txnid;
+        if (pid == env->me_pid && arg->mi_self_latter_reader_txnid > txnid)
+          arg->mi_self_latter_reader_txnid = txnid;
+      }
+    }
+  }
+
+  return MDBX_SUCCESS;
+}
+
+static __inline MDBX_cmp_func *get_default_keycmp(unsigned flags) {
+  return (flags & MDBX_REVERSEKEY)
+             ? cmp_reverse
+             : (flags & MDBX_INTEGERKEY) ? cmp_int_align2 : cmp_lexical;
+}
+
+static __inline MDBX_cmp_func *get_default_datacmp(unsigned flags) {
+  return !(flags & MDBX_DUPSORT)
+             ? cmp_lenfast
+             : ((flags & MDBX_INTEGERDUP)
+                    ? cmp_int_unaligned
+                    : ((flags & MDBX_REVERSEDUP) ? cmp_reverse : cmp_lexical));
+}
+
+static int mdbx_dbi_bind(MDBX_txn *txn, const MDBX_dbi dbi, unsigned user_flags,
+                         MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) {
+  /* LY: so, accepting only three cases for the table's flags:
+   * 1) user_flags and both comparators are zero
+   *    = assume the default mode/flags are requested for reading;
+   * 2) user_flags are exactly the same
+   *    = assume the target mode/flags were requested properly;
+   * 3) user_flags differ, but the table is empty and MDBX_CREATE is provided
+   *    = assume a proper create request with custom flags.
+   */
+  if ((user_flags ^ txn->mt_dbs[dbi].md_flags) & DB_PERSISTENT_FLAGS) {
+    /* flags differ, check other conditions */
+    if ((!user_flags && (!keycmp || keycmp == txn->mt_dbxs[dbi].md_cmp) &&
+         (!datacmp || datacmp == txn->mt_dbxs[dbi].md_dcmp)) ||
+        user_flags == MDBX_ACCEDE) {
+      /* no comparators were provided and flags are zero,
+       * so this appears to be case #1 above */
+      user_flags = txn->mt_dbs[dbi].md_flags;
+    } else if ((user_flags & MDBX_CREATE) && txn->mt_dbs[dbi].md_entries == 0) {
+      if (txn->mt_flags & MDBX_TXN_RDONLY)
+        return /* FIXME: return extended info */ MDBX_EACCESS;
+      /* make sure the flag changes get committed */
+      txn->mt_dbs[dbi].md_flags = user_flags & DB_PERSISTENT_FLAGS;
+      txn->mt_flags |= MDBX_TXN_DIRTY;
+    } else {
+      return /* FIXME: return extended info */ MDBX_INCOMPATIBLE;
+    }
+  }
+
+  if (!keycmp)
+    keycmp = txn->mt_dbxs[dbi].md_cmp ?
txn->mt_dbxs[dbi].md_cmp + : get_default_keycmp(user_flags); + if (txn->mt_dbxs[dbi].md_cmp != keycmp) { + if (txn->mt_dbxs[dbi].md_cmp) + return MDBX_EINVAL; + txn->mt_dbxs[dbi].md_cmp = keycmp; + } + + if (!datacmp) + datacmp = txn->mt_dbxs[dbi].md_dcmp ? txn->mt_dbxs[dbi].md_dcmp + : get_default_datacmp(user_flags); + if (txn->mt_dbxs[dbi].md_dcmp != datacmp) { + if (txn->mt_dbxs[dbi].md_dcmp) + return MDBX_EINVAL; + txn->mt_dbxs[dbi].md_dcmp = datacmp; + } + + return MDBX_SUCCESS; +} + +static int dbi_open(MDBX_txn *txn, const char *table_name, unsigned user_flags, + MDBX_dbi *dbi, MDBX_cmp_func *keycmp, + MDBX_cmp_func *datacmp) { + int rc = MDBX_EINVAL; + if (unlikely(!dbi)) + return rc; + + if (unlikely((user_flags & ~DB_USABLE_FLAGS) != 0)) { + early_bailout: + *dbi = 0; + return rc; + } + + rc = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + goto early_bailout; + + switch (user_flags & (MDBX_INTEGERDUP | MDBX_DUPFIXED | MDBX_DUPSORT | + MDBX_REVERSEDUP | MDBX_ACCEDE)) { + case MDBX_ACCEDE: + if ((user_flags & MDBX_CREATE) == 0) + break; + __fallthrough /* fall through */; + default: + rc = MDBX_EINVAL; + goto early_bailout; + + case MDBX_DUPSORT: + case MDBX_DUPSORT | MDBX_REVERSEDUP: + case MDBX_DUPSORT | MDBX_DUPFIXED: + case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP: + case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP: + case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP | MDBX_REVERSEDUP: + case 0: + break; + } + + /* main table? */ + if (!table_name) { + rc = mdbx_dbi_bind(txn, MAIN_DBI, user_flags, keycmp, datacmp); + if (unlikely(rc != MDBX_SUCCESS)) + goto early_bailout; + *dbi = MAIN_DBI; + return rc; + } + + MDBX_env *env = txn->mt_env; + size_t len = strlen(table_name); + if (len > env->me_leaf_nodemax - NODESIZE - sizeof(MDBX_db)) + return MDBX_EINVAL; + + if (txn->mt_dbxs[MAIN_DBI].md_cmp == NULL) { + txn->mt_dbxs[MAIN_DBI].md_cmp = + get_default_keycmp(txn->mt_dbs[MAIN_DBI].md_flags); + txn->mt_dbxs[MAIN_DBI].md_dcmp = + get_default_datacmp(txn->mt_dbs[MAIN_DBI].md_flags); + } + + /* Is the DB already open? */ + MDBX_dbi scan, slot; + for (slot = scan = txn->mt_numdbs; --scan >= CORE_DBS;) { + if (!txn->mt_dbxs[scan].md_name.iov_len) { + /* Remember this free slot */ + slot = scan; + continue; + } + if (len == txn->mt_dbxs[scan].md_name.iov_len && + !strncmp(table_name, txn->mt_dbxs[scan].md_name.iov_base, len)) { + rc = mdbx_dbi_bind(txn, scan, user_flags, keycmp, datacmp); + if (unlikely(rc != MDBX_SUCCESS)) + goto early_bailout; + *dbi = scan; + return rc; + } + } + + /* Fail, if no free slot and max hit */ + if (unlikely(slot >= env->me_maxdbs)) { + rc = MDBX_DBS_FULL; + goto early_bailout; + } + + /* Cannot mix named table with some main-table flags */ + if (unlikely(txn->mt_dbs[MAIN_DBI].md_flags & + (MDBX_DUPSORT | MDBX_INTEGERKEY))) { + rc = (user_flags & MDBX_CREATE) ? 
MDBX_INCOMPATIBLE : MDBX_NOTFOUND; + goto early_bailout; + } + + /* Find the DB info */ + MDBX_val key, data; + key.iov_len = len; + key.iov_base = (void *)table_name; + MDBX_cursor_couple couple; + rc = mdbx_cursor_init(&couple.outer, txn, MAIN_DBI); + if (unlikely(rc != MDBX_SUCCESS)) + goto early_bailout; + rc = mdbx_cursor_set(&couple.outer, &key, &data, MDBX_SET).err; + if (unlikely(rc != MDBX_SUCCESS)) { + if (rc != MDBX_NOTFOUND || !(user_flags & MDBX_CREATE)) + goto early_bailout; + } else { + /* make sure this is actually a table */ + MDBX_node *node = page_node(couple.outer.mc_pg[couple.outer.mc_top], + couple.outer.mc_ki[couple.outer.mc_top]); + if (unlikely((node_flags(node) & (F_DUPDATA | F_SUBDATA)) != F_SUBDATA)) { + rc = MDBX_INCOMPATIBLE; + goto early_bailout; + } + if (!MDBX_DISABLE_PAGECHECKS && unlikely(data.iov_len != sizeof(MDBX_db))) { + rc = MDBX_CORRUPTED; + goto early_bailout; + } + } + + if (rc != MDBX_SUCCESS && unlikely(txn->mt_flags & MDBX_TXN_RDONLY)) { + rc = MDBX_EACCESS; + goto early_bailout; + } + + /* Done here so we cannot fail after creating a new DB */ + char *namedup = mdbx_strdup(table_name); + if (unlikely(!namedup)) { + rc = MDBX_ENOMEM; + goto early_bailout; + } + + int err = mdbx_fastmutex_acquire(&env->me_dbi_lock); + if (unlikely(err != MDBX_SUCCESS)) { + rc = err; + mdbx_free(namedup); + goto early_bailout; + } + + /* Import handles from env */ + dbi_import_locked(txn); + + /* Rescan after mutex acquisition & import handles */ + for (slot = scan = txn->mt_numdbs; --scan >= CORE_DBS;) { + if (!txn->mt_dbxs[scan].md_name.iov_len) { + /* Remember this free slot */ + slot = scan; + continue; + } + if (len == txn->mt_dbxs[scan].md_name.iov_len && + !strncmp(table_name, txn->mt_dbxs[scan].md_name.iov_base, len)) { + rc = mdbx_dbi_bind(txn, scan, user_flags, keycmp, datacmp); + if (unlikely(rc != MDBX_SUCCESS)) + goto later_bailout; + *dbi = scan; + goto later_exit; + } + } + + if (unlikely(slot >= env->me_maxdbs)) { + rc = MDBX_DBS_FULL; + goto later_bailout; + } + + unsigned dbiflags = DBI_FRESH | DBI_VALID | DBI_USRVALID; + MDBX_db db_dummy; + if (unlikely(rc)) { + /* MDBX_NOTFOUND and MDBX_CREATE: Create new DB */ + mdbx_tassert(txn, rc == MDBX_NOTFOUND); + memset(&db_dummy, 0, sizeof(db_dummy)); + db_dummy.md_root = P_INVALID; + db_dummy.md_mod_txnid = txn->mt_txnid; + db_dummy.md_flags = user_flags & DB_PERSISTENT_FLAGS; + data.iov_len = sizeof(db_dummy); + data.iov_base = &db_dummy; + WITH_CURSOR_TRACKING(couple.outer, + rc = mdbx_cursor_put(&couple.outer, &key, &data, + F_SUBDATA | MDBX_NOOVERWRITE)); + + if (unlikely(rc != MDBX_SUCCESS)) + goto later_bailout; + + dbiflags |= DBI_DIRTY | DBI_CREAT; + txn->mt_flags |= MDBX_TXN_DIRTY; + } + + /* Got info, register DBI in this txn */ + memset(txn->mt_dbxs + slot, 0, sizeof(MDBX_dbx)); + txn->mt_dbs[slot] = *(MDBX_db *)data.iov_base; + env->me_dbflags[slot] = 0; + rc = mdbx_dbi_bind(txn, slot, user_flags, keycmp, datacmp); + if (unlikely(rc != MDBX_SUCCESS)) { + mdbx_tassert(txn, (dbiflags & DBI_CREAT) == 0); + later_bailout: + *dbi = 0; + later_exit: + mdbx_free(namedup); + } else { + txn->mt_dbistate[slot] = (uint8_t)dbiflags; + txn->mt_dbxs[slot].md_name.iov_base = namedup; + txn->mt_dbxs[slot].md_name.iov_len = len; + txn->mt_dbiseqs[slot] = ++env->me_dbiseqs[slot]; + if (!(dbiflags & DBI_CREAT)) + env->me_dbflags[slot] = txn->mt_dbs[slot].md_flags | DB_VALID; + if (txn->mt_numdbs == slot) { + mdbx_compiler_barrier(); + txn->mt_numdbs = env->me_numdbs = slot + 1; + if (!(txn->mt_flags & 
MDBX_TXN_RDONLY)) + txn->tw.cursors[slot] = NULL; + } + mdbx_assert(env, env->me_numdbs > slot); + *dbi = slot; + } + + mdbx_ensure(env, mdbx_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); + return rc; +} + +int mdbx_dbi_open(MDBX_txn *txn, const char *table_name, + MDBX_db_flags_t table_flags, MDBX_dbi *dbi) { + return dbi_open(txn, table_name, table_flags, dbi, nullptr, nullptr); +} + +int mdbx_dbi_open_ex(MDBX_txn *txn, const char *table_name, + MDBX_db_flags_t table_flags, MDBX_dbi *dbi, + MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) { + return dbi_open(txn, table_name, table_flags, dbi, keycmp, datacmp); +} + +__cold int mdbx_dbi_stat(MDBX_txn *txn, MDBX_dbi dbi, MDBX_stat *dest, + size_t bytes) { + int rc = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(!dest)) + return MDBX_EINVAL; + + if (unlikely(!check_dbi(txn, dbi, DBI_VALID))) + return MDBX_BAD_DBI; + + const size_t size_before_modtxnid = offsetof(MDBX_stat, ms_mod_txnid); + if (unlikely(bytes != sizeof(MDBX_stat)) && bytes != size_before_modtxnid) + return MDBX_EINVAL; + + if (unlikely(txn->mt_flags & MDBX_TXN_BLOCKED)) + return MDBX_BAD_TXN; + + if (unlikely(txn->mt_dbistate[dbi] & DBI_STALE)) { + rc = mdbx_fetch_sdb(txn, dbi); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + } + + dest->ms_psize = txn->mt_env->me_psize; + stat_get(&txn->mt_dbs[dbi], dest, bytes); + return MDBX_SUCCESS; +} + +static int mdbx_dbi_close_locked(MDBX_env *env, MDBX_dbi dbi) { + mdbx_assert(env, dbi >= CORE_DBS); + if (unlikely(dbi >= env->me_numdbs)) + return MDBX_BAD_DBI; + + char *ptr = env->me_dbxs[dbi].md_name.iov_base; + /* If there was no name, this was already closed */ + if (unlikely(!ptr)) + return MDBX_BAD_DBI; + + env->me_dbflags[dbi] = 0; + env->me_dbiseqs[dbi]++; + env->me_dbxs[dbi].md_name.iov_len = 0; + mdbx_memory_fence(mo_AcquireRelease, true); + env->me_dbxs[dbi].md_name.iov_base = NULL; + mdbx_free(ptr); + + if (env->me_numdbs == dbi + 1) { + unsigned i = env->me_numdbs; + do + --i; + while (i > CORE_DBS && !env->me_dbxs[i - 1].md_name.iov_base); + env->me_numdbs = i; + } + + return MDBX_SUCCESS; +} + +int mdbx_dbi_close(MDBX_env *env, MDBX_dbi dbi) { + int rc = check_env(env, true); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(dbi < CORE_DBS || dbi >= env->me_maxdbs)) + return MDBX_BAD_DBI; + + rc = mdbx_fastmutex_acquire(&env->me_dbi_lock); + if (likely(rc == MDBX_SUCCESS)) { + rc = (dbi < env->me_maxdbs && (env->me_dbflags[dbi] & DB_VALID)) + ? 
mdbx_dbi_close_locked(env, dbi) + : MDBX_BAD_DBI; + mdbx_ensure(env, mdbx_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); + } + return rc; +} + +int mdbx_dbi_flags_ex(MDBX_txn *txn, MDBX_dbi dbi, unsigned *flags, + unsigned *state) { + int rc = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(!flags || !state)) + return MDBX_EINVAL; + + if (unlikely(!check_dbi(txn, dbi, DBI_VALID))) + return MDBX_BAD_DBI; + + *flags = txn->mt_dbs[dbi].md_flags & DB_PERSISTENT_FLAGS; + *state = + txn->mt_dbistate[dbi] & (DBI_FRESH | DBI_CREAT | DBI_DIRTY | DBI_STALE); + + return MDBX_SUCCESS; +} + +#ifndef LIBMDBX_NO_EXPORTS_LEGACY_API +int mdbx_dbi_flags(MDBX_txn *txn, MDBX_dbi dbi, unsigned *flags) { + return __inline_mdbx_dbi_flags(txn, dbi, flags); +} +#endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */ + +static int mdbx_drop_tree(MDBX_cursor *mc, const bool may_have_subDBs) { + int rc = mdbx_page_search(mc, NULL, MDBX_PS_FIRST); + if (likely(rc == MDBX_SUCCESS)) { + MDBX_txn *txn = mc->mc_txn; + + /* DUPSORT sub-DBs have no ovpages/DBs. Omit scanning leaves. + * This also avoids any P_LEAF2 pages, which have no nodes. + * Also if the DB doesn't have sub-DBs and has no overflow + * pages, omit scanning leaves. */ + if (!(may_have_subDBs | mc->mc_db->md_overflow_pages)) + mdbx_cursor_pop(mc); + + rc = mdbx_pnl_need(&txn->tw.retired_pages, + mc->mc_db->md_branch_pages + mc->mc_db->md_leaf_pages + + mc->mc_db->md_overflow_pages); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + + MDBX_cursor mx; + cursor_copy(mc, &mx); + while (mc->mc_snum > 0) { + MDBX_page *const mp = mc->mc_pg[mc->mc_top]; + const unsigned nkeys = page_numkeys(mp); + if (IS_LEAF(mp)) { + mdbx_cassert(mc, mc->mc_snum == mc->mc_db->md_depth); + for (unsigned i = 0; i < nkeys; i++) { + MDBX_node *node = page_node(mp, i); + if (node_flags(node) & F_BIGDATA) { + rc = mdbx_page_retire_ex(mc, node_largedata_pgno(node), NULL, 0); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + if (!(may_have_subDBs | mc->mc_db->md_overflow_pages)) + goto pop; + } else if (node_flags(node) & F_SUBDATA) { + if (unlikely((node_flags(node) & F_DUPDATA) == 0)) { + rc = /* disallowing implicit subDB deletion */ MDBX_INCOMPATIBLE; + goto bailout; + } + rc = mdbx_xcursor_init1(mc, node, mp); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + rc = mdbx_drop_tree(&mc->mc_xcursor->mx_cursor, false); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } + } + } else { + mdbx_cassert(mc, mc->mc_snum < mc->mc_db->md_depth); + if (mdbx_audit_enabled()) + mc->mc_flags |= C_RETIRING; + const int pagetype = + (IS_FROZEN(txn, mp) ? P_FROZEN : 0) + + ((mc->mc_snum + 1 == mc->mc_db->md_depth) ? P_LEAF : P_BRANCH); + for (unsigned i = 0; i < nkeys; i++) { + MDBX_node *node = page_node(mp, i); + mdbx_tassert(txn, (node_flags(node) & + (F_BIGDATA | F_SUBDATA | F_DUPDATA)) == 0); + const pgno_t pgno = node_pgno(node); + rc = mdbx_page_retire_ex(mc, pgno, NULL, pagetype); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } + if (mdbx_audit_enabled()) + mc->mc_flags -= C_RETIRING; + } + if (!mc->mc_top) + break; + mdbx_cassert(mc, nkeys > 0); + mc->mc_ki[mc->mc_top] = (indx_t)nkeys; + rc = mdbx_cursor_sibling(mc, SIBLING_RIGHT); + if (unlikely(rc != MDBX_SUCCESS)) { + if (unlikely(rc != MDBX_NOTFOUND)) + goto bailout; + /* no more siblings, go back to beginning + * of previous level. 
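+         * The per-level page pointers saved in the `mx` cursor copy are
+         * used right below to restore the stack before retiring continues.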
*/ + pop: + mdbx_cursor_pop(mc); + mc->mc_ki[0] = 0; + for (unsigned i = 1; i < mc->mc_snum; i++) { + mc->mc_ki[i] = 0; + mc->mc_pg[i] = mx.mc_pg[i]; + } + } + } + rc = mdbx_page_retire(mc, mc->mc_pg[0]); + bailout: + if (unlikely(rc != MDBX_SUCCESS)) + txn->mt_flags |= MDBX_TXN_ERROR; + } else if (rc == MDBX_NOTFOUND) { + rc = MDBX_SUCCESS; + } + mc->mc_flags &= ~C_INITIALIZED; + return rc; +} + +int mdbx_drop(MDBX_txn *txn, MDBX_dbi dbi, bool del) { + int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + MDBX_cursor *mc; + rc = mdbx_cursor_open(txn, dbi, &mc); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + rc = mdbx_drop_tree(mc, dbi == MAIN_DBI || + (mc->mc_db->md_flags & MDBX_DUPSORT) != 0); + /* Invalidate the dropped DB's cursors */ + for (MDBX_cursor *m2 = txn->tw.cursors[dbi]; m2; m2 = m2->mc_next) + m2->mc_flags &= ~(C_INITIALIZED | C_EOF); + if (unlikely(rc)) + goto bailout; + + /* Can't delete the main DB */ + if (del && dbi >= CORE_DBS) { + rc = mdbx_del0(txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL, F_SUBDATA); + if (likely(rc == MDBX_SUCCESS)) { + mdbx_tassert(txn, txn->mt_dbistate[MAIN_DBI] & DBI_DIRTY); + mdbx_tassert(txn, txn->mt_flags & MDBX_TXN_DIRTY); + txn->mt_dbistate[dbi] = DBI_STALE; + MDBX_env *env = txn->mt_env; + rc = mdbx_fastmutex_acquire(&env->me_dbi_lock); + if (unlikely(rc != MDBX_SUCCESS)) { + txn->mt_flags |= MDBX_TXN_ERROR; + goto bailout; + } + mdbx_dbi_close_locked(env, dbi); + mdbx_ensure(env, + mdbx_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); + } else { + txn->mt_flags |= MDBX_TXN_ERROR; + } + } else { + /* reset the DB record, mark it dirty */ + txn->mt_dbistate[dbi] |= DBI_DIRTY; + txn->mt_dbs[dbi].md_depth = 0; + txn->mt_dbs[dbi].md_branch_pages = 0; + txn->mt_dbs[dbi].md_leaf_pages = 0; + txn->mt_dbs[dbi].md_overflow_pages = 0; + txn->mt_dbs[dbi].md_entries = 0; + txn->mt_dbs[dbi].md_root = P_INVALID; + txn->mt_dbs[dbi].md_seq = 0; + txn->mt_flags |= MDBX_TXN_DIRTY; + } + +bailout: + mdbx_cursor_close(mc); + return rc; +} + +int mdbx_set_compare(MDBX_txn *txn, MDBX_dbi dbi, MDBX_cmp_func *cmp) { + int rc = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID))) + return MDBX_BAD_DBI; + + txn->mt_dbxs[dbi].md_cmp = cmp; + return MDBX_SUCCESS; +} + +int mdbx_set_dupsort(MDBX_txn *txn, MDBX_dbi dbi, MDBX_cmp_func *cmp) { + int rc = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID))) + return MDBX_BAD_DBI; + + txn->mt_dbxs[dbi].md_dcmp = cmp; + return MDBX_SUCCESS; +} + +__cold int mdbx_reader_list(const MDBX_env *env, MDBX_reader_list_func *func, + void *ctx) { + int rc = check_env(env, true); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(!func)) + return MDBX_EINVAL; + + rc = MDBX_RESULT_TRUE; + int serial = 0; + MDBX_lockinfo *const lck = env->me_lck_mmap.lck; + if (likely(lck)) { + const unsigned snap_nreaders = + atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); + for (unsigned i = 0; i < snap_nreaders; i++) { + const MDBX_reader *r = lck->mti_readers + i; + retry_reader:; + const uint32_t pid = atomic_load32(&r->mr_pid, mo_AcquireRelease); + if (!pid) + continue; + txnid_t txnid = safe64_read(&r->mr_txnid); + const uint64_t tid = atomic_load64(&r->mr_tid, mo_Relaxed); + const pgno_t pages_used = + atomic_load32(&r->mr_snapshot_pages_used, mo_Relaxed); + const uint64_t reader_pages_retired = + 
atomic_load64(&r->mr_snapshot_pages_retired, mo_Relaxed);
+      if (unlikely(
+              txnid != safe64_read(&r->mr_txnid) ||
+              pid != atomic_load32(&r->mr_pid, mo_AcquireRelease) ||
+              tid != atomic_load64(&r->mr_tid, mo_Relaxed) ||
+              pages_used !=
+                  atomic_load32(&r->mr_snapshot_pages_used, mo_Relaxed) ||
+              reader_pages_retired !=
+                  atomic_load64(&r->mr_snapshot_pages_retired, mo_Relaxed)))
+        goto retry_reader;
+
+      mdbx_assert(env, txnid > 0);
+      if (txnid >= SAFE64_INVALID_THRESHOLD)
+        txnid = 0;
+
+      size_t bytes_used = 0;
+      size_t bytes_retained = 0;
+      uint64_t lag = 0;
+      if (txnid) {
+      retry_header:;
+        const MDBX_meta *const recent_meta = mdbx_meta_head(env);
+        const uint64_t head_pages_retired =
+            unaligned_peek_u64(4, recent_meta->mm_pages_retired);
+        const txnid_t head_txnid = mdbx_meta_txnid_fluid(env, recent_meta);
+        mdbx_compiler_barrier();
+        if (unlikely(
+                recent_meta != mdbx_meta_head(env) ||
+                head_pages_retired !=
+                    unaligned_peek_u64(4, recent_meta->mm_pages_retired)) ||
+            head_txnid != mdbx_meta_txnid_fluid(env, recent_meta))
+          goto retry_header;
+
+        lag = (head_txnid - txnid) / xMDBX_TXNID_STEP;
+        bytes_used = pgno2bytes(env, pages_used);
+        bytes_retained = (head_pages_retired > reader_pages_retired)
+                             ? pgno2bytes(env, (pgno_t)(head_pages_retired -
+                                                        reader_pages_retired))
+                             : 0;
+      }
+      rc = func(ctx, ++serial, i, pid, (mdbx_tid_t)tid, txnid, lag, bytes_used,
+                bytes_retained);
+      if (unlikely(rc != MDBX_SUCCESS))
+        break;
+    }
+  }
+
+  return rc;
+}
+
+/* Insert pid into list if not already present.
+ * Returns true if inserted, false if the pid was already present. */
+static bool __cold mdbx_pid_insert(uint32_t *ids, uint32_t pid) {
+  /* binary search of pid in list */
+  unsigned base = 0;
+  unsigned cursor = 1;
+  int val = 0;
+  unsigned n = ids[0];
+
+  while (n > 0) {
+    unsigned pivot = n >> 1;
+    cursor = base + pivot + 1;
+    val = pid - ids[cursor];
+
+    if (val < 0) {
+      n = pivot;
+    } else if (val > 0) {
+      base = cursor;
+      n -= pivot + 1;
+    } else {
+      /* found, so it's a duplicate */
+      return false;
+    }
+  }
+
+  if (val > 0)
+    ++cursor;
+
+  ids[0]++;
+  for (n = ids[0]; n > cursor; n--)
+    ids[n] = ids[n - 1];
+  ids[n] = pid;
+  return true;
+}
+
+__cold int mdbx_reader_check(MDBX_env *env, int *dead) {
+  if (dead)
+    *dead = 0;
+  return mdbx_cleanup_dead_readers(env, false, dead);
+}
+
+/* Return:
+ *  MDBX_RESULT_TRUE - done and mutex recovered
+ *  MDBX_SUCCESS     - done
+ *  Otherwise errcode. */
+MDBX_INTERNAL_FUNC __cold int
+mdbx_cleanup_dead_readers(MDBX_env *env, int rdt_locked, int *dead) {
+  int rc = check_env(env, true);
+  if (unlikely(rc != MDBX_SUCCESS))
+    return rc;
+
+  mdbx_assert(env, rdt_locked >= 0);
+  MDBX_lockinfo *const lck = env->me_lck_mmap.lck;
+  if (unlikely(lck == NULL)) {
+    /* exclusive mode */
+    if (dead)
+      *dead = 0;
+    return MDBX_SUCCESS;
+  }
+
+  const unsigned snap_nreaders =
+      atomic_load32(&lck->mti_numreaders, mo_AcquireRelease);
+  uint32_t pidsbuf_onstask[142];
+  uint32_t *const pids =
+      (snap_nreaders < ARRAY_LENGTH(pidsbuf_onstask))
+          ? pidsbuf_onstask
+          : mdbx_malloc((snap_nreaders + 1) * sizeof(uint32_t));
+  if (unlikely(!pids))
+    return MDBX_ENOMEM;
+
+  pids[0] = 0;
+  int count = 0;
+  for (unsigned i = 0; i < snap_nreaders; i++) {
+    const uint32_t pid =
+        atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease);
+    if (pid == 0)
+      continue /* skip empty */;
+    if (pid == env->me_pid)
+      continue /* skip self */;
+    if (!mdbx_pid_insert(pids, pid))
+      continue /* this pid was already processed */;
+
+    int err = mdbx_rpid_check(env, pid);
+    if (err == MDBX_RESULT_TRUE)
+      continue /* reader is live */;
+
+    if (err != MDBX_SUCCESS) {
+      rc = err;
+      break /* mdbx_rpid_check() failed */;
+    }
+
+    /* stale reader found */
+    if (!rdt_locked) {
+      err = mdbx_rdt_lock(env);
+      if (MDBX_IS_ERROR(err)) {
+        rc = err;
+        break;
+      }
+
+      rdt_locked = -1;
+      if (err == MDBX_RESULT_TRUE) {
+        /* mutex recovered, the mdbx_ipclock_failed() checked all readers */
+        rc = MDBX_RESULT_TRUE;
+        break;
+      }
+
+      /* another process may have cleaned and reused the slot, so recheck */
+      if (lck->mti_readers[i].mr_pid.weak != pid)
+        continue;
+
+      err = mdbx_rpid_check(env, pid);
+      if (MDBX_IS_ERROR(err)) {
+        rc = err;
+        break;
+      }
+
+      if (err != MDBX_SUCCESS)
+        continue /* lost the race with another process, the slot was reused */;
+    }
+
+    /* clean it */
+    for (unsigned j = i; j < snap_nreaders; j++) {
+      if (lck->mti_readers[j].mr_pid.weak == pid) {
+        mdbx_debug("clear stale reader pid %" PRIuPTR " txn %" PRIaTXN,
+                   (size_t)pid, lck->mti_readers[j].mr_txnid.weak);
+        atomic_store32(&lck->mti_readers[j].mr_pid, 0, mo_Relaxed);
+        atomic_store32(&lck->mti_readers_refresh_flag, true,
+                       mo_AcquireRelease);
+        count++;
+      }
+    }
+  }
+
+  if (likely(!MDBX_IS_ERROR(rc)))
+    atomic_store64(&lck->mti_reader_check_timestamp, mdbx_osal_monotime(),
+                   mo_Relaxed);
+
+  if (rdt_locked < 0)
+    mdbx_rdt_unlock(env);
+
+  if (pids != pidsbuf_onstask)
+    mdbx_free(pids);
+
+  if (dead)
+    *dead = count;
+  return rc;
+}
+
+__cold int mdbx_setup_debug(int loglevel, int flags, MDBX_debug_func *logger) {
+  const int rc = mdbx_runtime_flags | (mdbx_loglevel << 16);
+
+  if (loglevel != MDBX_LOG_DONTCHANGE)
+    mdbx_loglevel = (uint8_t)loglevel;
+
+  if (flags != MDBX_DBG_DONTCHANGE) {
+    flags &=
+#if MDBX_DEBUG
+        MDBX_DBG_ASSERT | MDBX_DBG_AUDIT | MDBX_DBG_JITTER |
+#endif
+        MDBX_DBG_DUMP | MDBX_DBG_LEGACY_MULTIOPEN | MDBX_DBG_LEGACY_OVERLAP;
+    mdbx_runtime_flags = (uint8_t)flags;
+  }
+
+  if (logger != MDBX_LOGGER_DONTCHANGE)
+    mdbx_debug_logger = logger;
+  return rc;
+}
+
+static txnid_t __cold mdbx_kick_longlived_readers(MDBX_env *env,
+                                                  const txnid_t laggard) {
+  mdbx_debug("DB size maxed out by reading #%" PRIaTXN, laggard);
+
+  int retry;
+  for (retry = 0; retry < INT_MAX; ++retry) {
+    txnid_t oldest = mdbx_recent_steady_txnid(env);
+    mdbx_assert(env, oldest < env->me_txn0->mt_txnid);
+    mdbx_assert(env, oldest >= laggard);
+    mdbx_assert(env, oldest >= env->me_lck->mti_oldest_reader.weak);
+    MDBX_lockinfo *const lck = env->me_lck_mmap.lck;
+    if (oldest == laggard || unlikely(!lck /* without-LCK mode */))
+      return oldest;
+
+    if (MDBX_IS_ERROR(mdbx_cleanup_dead_readers(env, false, NULL)))
+      break;
+
+    MDBX_reader *asleep = nullptr;
+    uint64_t oldest_retired = UINT64_MAX;
+    const unsigned snap_nreaders =
+        atomic_load32(&lck->mti_numreaders, mo_AcquireRelease);
+    for (unsigned i = 0; i < snap_nreaders; ++i) {
+    retry:
+      if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease)) {
+        /* mdbx_jitter4testing(true); */
+        const uint64_t snap_retired = atomic_load64(
+            &lck->mti_readers[i].mr_snapshot_pages_retired,
mo_Relaxed); + const txnid_t snap_txnid = safe64_read(&lck->mti_readers[i].mr_txnid); + if (unlikely(snap_retired != + atomic_load64( + &lck->mti_readers[i].mr_snapshot_pages_retired, + mo_AcquireRelease) || + snap_txnid != safe64_read(&lck->mti_readers[i].mr_txnid))) + goto retry; + if (oldest > snap_txnid && + laggard <= /* ignore pending updates */ snap_txnid) { + oldest = snap_txnid; + oldest_retired = snap_retired; + asleep = &lck->mti_readers[i]; + } + } + } + + if (laggard < oldest || !asleep) { + if (retry && env->me_hsr_callback) { + /* LY: notify end of hsr-loop */ + const txnid_t gap = oldest - laggard; + env->me_hsr_callback(env, env->me_txn, 0, 0, laggard, + (gap < UINT_MAX) ? (unsigned)gap : UINT_MAX, 0, + -retry); + } + mdbx_notice("hsr-kick: update oldest %" PRIaTXN " -> %" PRIaTXN, + lck->mti_oldest_reader.weak, oldest); + mdbx_assert(env, lck->mti_oldest_reader.weak <= oldest); + return atomic_store64(&lck->mti_oldest_reader, oldest, mo_Relaxed); + } + + if (!env->me_hsr_callback) + break; + + uint32_t pid = atomic_load32(&asleep->mr_pid, mo_AcquireRelease); + uint64_t tid = asleep->mr_tid.weak; + if (safe64_read(&asleep->mr_txnid) != laggard || pid <= 0) + continue; + + const MDBX_meta *head_meta = mdbx_meta_head(env); + const txnid_t gap = + (mdbx_meta_txnid_stable(env, head_meta) - laggard) / xMDBX_TXNID_STEP; + const uint64_t head_retired = + unaligned_peek_u64(4, head_meta->mm_pages_retired); + const size_t space = + (oldest_retired > head_retired) + ? pgno2bytes(env, (pgno_t)(oldest_retired - head_retired)) + : 0; + int rc = env->me_hsr_callback( + env, env->me_txn, pid, (mdbx_tid_t)tid, laggard, + (gap < UINT_MAX) ? (unsigned)gap : UINT_MAX, space, retry); + if (rc < 0) + break; + + if (rc > 0) { + if (rc == 1) { + safe64_reset_compare(&asleep->mr_txnid, laggard); + } else { + safe64_reset(&asleep->mr_txnid, true); + atomic_store64(&asleep->mr_tid, 0, mo_Relaxed); + atomic_store32(&asleep->mr_pid, 0, mo_Relaxed); + } + atomic_store32(&lck->mti_readers_refresh_flag, true, mo_Relaxed); + } + } + + if (retry && env->me_hsr_callback) { + /* LY: notify end of hsr-loop */ + env->me_hsr_callback(env, env->me_txn, 0, 0, laggard, 0, 0, -retry); + } + return mdbx_find_oldest(env->me_txn); +} + +#ifndef LIBMDBX_NO_EXPORTS_LEGACY_API +__cold int mdbx_env_set_syncbytes(MDBX_env *env, size_t threshold) { + return __inline_mdbx_env_set_syncbytes(env, threshold); +} + +__cold int mdbx_env_set_syncperiod(MDBX_env *env, unsigned seconds_16dot16) { + return __inline_mdbx_env_set_syncperiod(env, seconds_16dot16); +} +#endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */ + +__cold int mdbx_env_set_hsr(MDBX_env *env, MDBX_hsr_func *hsr) { + int rc = check_env(env, false); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + env->me_hsr_callback = hsr; + return MDBX_SUCCESS; +} + +MDBX_hsr_func *__cold mdbx_env_get_hsr(const MDBX_env *env) { + return likely(env && env->me_signature.weak == MDBX_ME_SIGNATURE) + ? env->me_hsr_callback + : NULL; +} + +#ifdef __SANITIZE_THREAD__ +/* LY: avoid tsan-trap by me_txn, mm_last_pg and mt_next_pgno */ +__attribute__((__no_sanitize_thread__, __noinline__)) +#endif +int mdbx_txn_straggler(const MDBX_txn *txn, int *percent) +{ + int rc = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return (rc > 0) ? 
-rc : rc; + + MDBX_env *env = txn->mt_env; + if (unlikely((txn->mt_flags & MDBX_TXN_RDONLY) == 0)) { + if (percent) + *percent = + (int)((txn->mt_next_pgno * UINT64_C(100) + txn->mt_end_pgno / 2) / + txn->mt_end_pgno); + return 0; + } + + txnid_t recent; + MDBX_meta *meta; + do { + meta = mdbx_meta_head(env); + recent = mdbx_meta_txnid_fluid(env, meta); + if (percent) { + const pgno_t maxpg = meta->mm_geo.now; + *percent = (int)((meta->mm_geo.next * UINT64_C(100) + maxpg / 2) / maxpg); + } + } while (unlikely(recent != mdbx_meta_txnid_fluid(env, meta))); + + txnid_t lag = (recent - txn->mt_txnid) / xMDBX_TXNID_STEP; + return (lag > INT_MAX) ? INT_MAX : (int)lag; +} + +typedef struct mdbx_walk_ctx { + void *mw_user; + MDBX_pgvisitor_func *mw_visitor; + MDBX_txn *mw_txn; + MDBX_cursor *mw_cursor; + bool mw_dont_check_keys_ordering; +} mdbx_walk_ctx_t; + +static __cold int mdbx_walk_sdb(mdbx_walk_ctx_t *ctx, MDBX_db *const db, + const char *name, int deep); + +static MDBX_page_type_t walk_page_type(const MDBX_page *mp) { + if (mp) + switch (mp->mp_flags) { + case P_BRANCH: + return MDBX_page_branch; + case P_LEAF: + return MDBX_page_leaf; + case P_LEAF | P_LEAF2: + return MDBX_page_dupfixed_leaf; + case P_OVERFLOW: + return MDBX_page_large; + case P_META: + return MDBX_page_meta; + } + return MDBX_page_broken; +} + +/* Depth-first tree traversal. */ +static __cold int mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, + const char *name, int deep, + txnid_t parent_txnid) { + assert(pgno != P_INVALID); + MDBX_page *mp = nullptr; + int rc, err = mdbx_page_get(ctx->mw_cursor, pgno, &mp, parent_txnid); + if (err == MDBX_SUCCESS) + err = mdbx_page_check(ctx->mw_cursor, mp, 0); + + MDBX_page_type_t type = walk_page_type(mp); + const int nentries = (mp && !IS_OVERFLOW(mp)) ? page_numkeys(mp) : 1; + unsigned npages = (mp && IS_OVERFLOW(mp)) ? mp->mp_pages : 1; + size_t pagesize = pgno2bytes(ctx->mw_txn->mt_env, npages); + size_t header_size = (mp && !IS_LEAF2(mp) && !IS_OVERFLOW(mp)) + ? PAGEHDRSZ + mp->mp_lower + : PAGEHDRSZ; + size_t payload_size = 0; + size_t unused_size = + (mp && !IS_OVERFLOW(mp) ? page_room(mp) : pagesize - header_size) - + payload_size; + size_t align_bytes = 0; + + if (err == MDBX_SUCCESS) { + /* LY: Don't use mask here, e.g bitwise + * (P_BRANCH|P_LEAF|P_LEAF2|P_META|P_OVERFLOW|P_SUBP). + * Pages should not me marked dirty/loose or otherwise. 
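+     * Hence the strict equality tests of mp_flags below: any unexpected
+     * flag bit is reported as corruption rather than masked away.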
*/ + switch (mp->mp_flags) { + default: + err = MDBX_CORRUPTED; + break; + case P_BRANCH: + if (unlikely(nentries < 2)) + err = MDBX_CORRUPTED; + case P_LEAF: + case P_LEAF | P_LEAF2: + break; + } + } + + for (int i = 0; err == MDBX_SUCCESS && i < nentries; + align_bytes += ((payload_size + align_bytes) & 1), i++) { + if (type == MDBX_page_dupfixed_leaf) { + /* LEAF2 pages have no mp_ptrs[] or node headers */ + payload_size += mp->mp_leaf2_ksize; + continue; + } + + MDBX_node *node = page_node(mp, i); + payload_size += NODESIZE + node_ks(node); + + if (type == MDBX_page_branch) { + assert(i > 0 || node_ks(node) == 0); + continue; + } + + assert(type == MDBX_page_leaf); + switch (node_flags(node)) { + case 0 /* usual node */: + payload_size += node_ds(node); + break; + + case F_BIGDATA /* long data on the large/overflow page */: { + payload_size += sizeof(pgno_t); + const pgno_t large_pgno = node_largedata_pgno(node); + const size_t over_payload = node_ds(node); + const size_t over_header = PAGEHDRSZ; + npages = 1; + + MDBX_page *op; + err = mdbx_page_get(ctx->mw_cursor, large_pgno, &op, + pp_txnid4chk(mp, ctx->mw_txn)); + if (err == MDBX_SUCCESS) + err = mdbx_page_check(ctx->mw_cursor, op, 0); + if (err == MDBX_SUCCESS) { + /* LY: Don't use mask here, e.g bitwise + * (P_BRANCH|P_LEAF|P_LEAF2|P_META|P_OVERFLOW|P_SUBP). + * Pages should not me marked dirty/loose or otherwise. */ + if (unlikely(P_OVERFLOW != op->mp_flags)) + err = bad_page(mp, "wrong page type %d for large data", op->mp_flags); + else + npages = op->mp_pages; + } + + pagesize = pgno2bytes(ctx->mw_txn->mt_env, npages); + const size_t over_unused = pagesize - over_payload - over_header; + rc = ctx->mw_visitor(large_pgno, npages, ctx->mw_user, deep, name, + pagesize, MDBX_page_large, err, 1, over_payload, + over_header, over_unused); + if (unlikely(rc != MDBX_SUCCESS)) + return (rc == MDBX_RESULT_TRUE) ? MDBX_SUCCESS : rc; + } break; + + case F_SUBDATA /* sub-db */: { + const size_t namelen = node_ks(node); + payload_size += node_ds(node); + if (unlikely(namelen == 0 || node_ds(node) != sizeof(MDBX_db))) + err = MDBX_CORRUPTED; + } break; + + case F_SUBDATA | F_DUPDATA /* dupsorted sub-tree */: + payload_size += sizeof(MDBX_db); + if (unlikely(node_ds(node) != sizeof(MDBX_db))) + err = MDBX_CORRUPTED; + break; + + case F_DUPDATA /* short sub-page */: { + if (unlikely(node_ds(node) <= PAGEHDRSZ)) { + err = MDBX_CORRUPTED; + break; + } + + MDBX_page *sp = node_data(node); + const int nsubkeys = page_numkeys(sp); + size_t subheader_size = + IS_LEAF2(sp) ? 
PAGEHDRSZ : PAGEHDRSZ + sp->mp_lower; + size_t subunused_size = page_room(sp); + size_t subpayload_size = 0; + size_t subalign_bytes = 0; + MDBX_page_type_t subtype; + + switch (sp->mp_flags) { + case P_LEAF | P_SUBP: + subtype = MDBX_subpage_leaf; + break; + case P_LEAF | P_LEAF2 | P_SUBP: + subtype = MDBX_subpage_dupfixed_leaf; + break; + default: + subtype = MDBX_subpage_broken; + err = MDBX_CORRUPTED; + } + + for (int j = 0; err == MDBX_SUCCESS && j < nsubkeys; + subalign_bytes += ((subpayload_size + subalign_bytes) & 1), j++) { + + if (subtype == MDBX_subpage_dupfixed_leaf) { + /* LEAF2 pages have no mp_ptrs[] or node headers */ + subpayload_size += sp->mp_leaf2_ksize; + } else { + assert(subtype == MDBX_subpage_leaf); + MDBX_node *subnode = page_node(sp, j); + subpayload_size += NODESIZE + node_ks(subnode) + node_ds(subnode); + if (unlikely(node_flags(subnode) != 0)) + err = MDBX_CORRUPTED; + } + } + + rc = ctx->mw_visitor(pgno, 0, ctx->mw_user, deep + 1, name, node_ds(node), + subtype, err, nsubkeys, subpayload_size, + subheader_size, subunused_size + subalign_bytes); + if (unlikely(rc != MDBX_SUCCESS)) + return (rc == MDBX_RESULT_TRUE) ? MDBX_SUCCESS : rc; + header_size += subheader_size; + unused_size += subunused_size; + payload_size += subpayload_size; + align_bytes += subalign_bytes; + } break; + + default: + err = MDBX_CORRUPTED; + } + } + + rc = ctx->mw_visitor(pgno, 1, ctx->mw_user, deep, name, + ctx->mw_txn->mt_env->me_psize, type, err, nentries, + payload_size, header_size, unused_size + align_bytes); + if (unlikely(rc != MDBX_SUCCESS)) + return (rc == MDBX_RESULT_TRUE) ? MDBX_SUCCESS : rc; + + for (int i = 0; err == MDBX_SUCCESS && i < nentries; i++) { + if (type == MDBX_page_dupfixed_leaf) + continue; + + MDBX_node *node = page_node(mp, i); + if (type == MDBX_page_branch) { + err = mdbx_walk_tree(ctx, node_pgno(node), name, deep + 1, + pp_txnid4chk(mp, ctx->mw_txn)); + if (unlikely(err != MDBX_SUCCESS)) { + if (err == MDBX_RESULT_TRUE) + break; + return err; + } + continue; + } + + assert(type == MDBX_page_leaf); + MDBX_db db; + switch (node_flags(node)) { + default: + continue; + + case F_SUBDATA /* sub-db */: { + const size_t namelen = node_ks(node); + if (unlikely(namelen == 0 || node_ds(node) != sizeof(MDBX_db))) { + err = MDBX_CORRUPTED; + break; + } + + char namebuf_onstask[64]; + char *const sub_name = (namelen < sizeof(namebuf_onstask)) + ? 
namebuf_onstask + : mdbx_malloc(namelen + 1); + if (sub_name) { + memcpy(sub_name, node_key(node), namelen); + sub_name[namelen] = 0; + memcpy(&db, node_data(node), sizeof(db)); + err = mdbx_walk_sdb(ctx, &db, sub_name, deep + 1); + if (sub_name != namebuf_onstask) + mdbx_free(sub_name); + } else { + err = MDBX_ENOMEM; + } + } break; + + case F_SUBDATA | F_DUPDATA /* dupsorted sub-tree */: + if (unlikely(node_ds(node) != sizeof(MDBX_db) || + ctx->mw_cursor->mc_xcursor == NULL)) + err = MDBX_CORRUPTED; + else { + memcpy(&db, node_data(node), sizeof(db)); + assert(ctx->mw_cursor->mc_xcursor == + &container_of(ctx->mw_cursor, MDBX_cursor_couple, outer)->inner); + ctx->mw_cursor = &ctx->mw_cursor->mc_xcursor->mx_cursor; + err = mdbx_walk_tree(ctx, db.md_root, name, deep + 1, + pp_txnid4chk(mp, ctx->mw_txn)); + MDBX_xcursor *inner_xcursor = + container_of(ctx->mw_cursor, MDBX_xcursor, mx_cursor); + MDBX_cursor_couple *couple = + container_of(inner_xcursor, MDBX_cursor_couple, inner); + ctx->mw_cursor = &couple->outer; + } + break; + } + } + + return MDBX_SUCCESS; +} + +static __cold int mdbx_walk_sdb(mdbx_walk_ctx_t *ctx, MDBX_db *const db, + const char *name, int deep) { + if (unlikely(db->md_root == P_INVALID)) + return MDBX_SUCCESS; /* empty db */ + + MDBX_cursor_couple couple; + MDBX_dbx dbx = {.md_klen_min = INT_MAX}; + uint8_t dbistate = DBI_VALID | DBI_AUDITED; + int rc = mdbx_couple_init(&couple, ~0u, ctx->mw_txn, db, &dbx, &dbistate); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (ctx->mw_dont_check_keys_ordering) { + couple.outer.mc_flags |= C_SKIPORD; + couple.inner.mx_cursor.mc_flags |= C_SKIPORD; + } + couple.outer.mc_next = ctx->mw_cursor; + ctx->mw_cursor = &couple.outer; + rc = mdbx_walk_tree(ctx, db->md_root, name, deep, ctx->mw_txn->mt_txnid); + ctx->mw_cursor = couple.outer.mc_next; + return rc; +} + +__cold int mdbx_env_pgwalk(MDBX_txn *txn, MDBX_pgvisitor_func *visitor, + void *user, bool dont_check_keys_ordering) { + int rc = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + mdbx_walk_ctx_t ctx; + memset(&ctx, 0, sizeof(ctx)); + ctx.mw_txn = txn; + ctx.mw_user = user; + ctx.mw_visitor = visitor; + ctx.mw_dont_check_keys_ordering = dont_check_keys_ordering; + + rc = visitor(0, NUM_METAS, user, 0, MDBX_PGWALK_META, + pgno2bytes(txn->mt_env, NUM_METAS), MDBX_page_meta, MDBX_SUCCESS, + NUM_METAS, sizeof(MDBX_meta) * NUM_METAS, PAGEHDRSZ * NUM_METAS, + (txn->mt_env->me_psize - sizeof(MDBX_meta) - PAGEHDRSZ) * + NUM_METAS); + if (!MDBX_IS_ERROR(rc)) + rc = mdbx_walk_sdb(&ctx, &txn->mt_dbs[FREE_DBI], MDBX_PGWALK_GC, 0); + if (!MDBX_IS_ERROR(rc)) + rc = mdbx_walk_sdb(&ctx, &txn->mt_dbs[MAIN_DBI], MDBX_PGWALK_MAIN, 0); + return rc; +} + +int mdbx_canary_put(MDBX_txn *txn, const MDBX_canary *canary) { + int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (likely(canary)) { + if (txn->mt_canary.x == canary->x && txn->mt_canary.y == canary->y && + txn->mt_canary.z == canary->z) + return MDBX_SUCCESS; + txn->mt_canary.x = canary->x; + txn->mt_canary.y = canary->y; + txn->mt_canary.z = canary->z; + } + txn->mt_canary.v = txn->mt_txnid; + txn->mt_flags |= MDBX_TXN_DIRTY; + + return MDBX_SUCCESS; +} + +int mdbx_canary_get(const MDBX_txn *txn, MDBX_canary *canary) { + int rc = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(canary == NULL)) + return MDBX_EINVAL; + + *canary = txn->mt_canary; + return MDBX_SUCCESS; +} + +int 
mdbx_cursor_on_first(const MDBX_cursor *mc) { + if (unlikely(mc == NULL)) + return MDBX_EINVAL; + + if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) + return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL + : MDBX_EBADSIGN; + + if (!(mc->mc_flags & C_INITIALIZED)) + return mc->mc_db->md_entries ? MDBX_RESULT_FALSE : MDBX_RESULT_TRUE; + + for (unsigned i = 0; i < mc->mc_snum; ++i) { + if (mc->mc_ki[i]) + return MDBX_RESULT_FALSE; + } + + return MDBX_RESULT_TRUE; +} + +int mdbx_cursor_on_last(const MDBX_cursor *mc) { + if (unlikely(mc == NULL)) + return MDBX_EINVAL; + + if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) + return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL + : MDBX_EBADSIGN; + + if (!(mc->mc_flags & C_INITIALIZED)) + return mc->mc_db->md_entries ? MDBX_RESULT_FALSE : MDBX_RESULT_TRUE; + + for (unsigned i = 0; i < mc->mc_snum; ++i) { + unsigned nkeys = page_numkeys(mc->mc_pg[i]); + if (mc->mc_ki[i] < nkeys - 1) + return MDBX_RESULT_FALSE; + } + + return MDBX_RESULT_TRUE; +} + +int mdbx_cursor_eof(const MDBX_cursor *mc) { + if (unlikely(mc == NULL)) + return MDBX_EINVAL; + + if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) + return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL + : MDBX_EBADSIGN; + + return ((mc->mc_flags & (C_INITIALIZED | C_EOF)) == C_INITIALIZED && + mc->mc_snum && + mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top])) + ? MDBX_RESULT_FALSE + : MDBX_RESULT_TRUE; +} + +//------------------------------------------------------------------------------ + +struct diff_result { + ptrdiff_t diff; + unsigned level; + int root_nkeys; +}; + +/* calculates: r = x - y */ +__hot static int cursor_diff(const MDBX_cursor *const __restrict x, + const MDBX_cursor *const __restrict y, + struct diff_result *const __restrict r) { + r->diff = 0; + r->level = 0; + r->root_nkeys = 0; + + if (unlikely(x->mc_signature != MDBX_MC_LIVE)) + return (x->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL + : MDBX_EBADSIGN; + + if (unlikely(y->mc_signature != MDBX_MC_LIVE)) + return (y->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL + : MDBX_EBADSIGN; + + int rc = check_txn(x->mc_txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(x->mc_txn != y->mc_txn)) + return MDBX_BAD_TXN; + + if (unlikely(y->mc_dbi != x->mc_dbi)) + return MDBX_EINVAL; + + if (unlikely(!(y->mc_flags & x->mc_flags & C_INITIALIZED))) + return MDBX_ENODATA; + + while (likely(r->level < y->mc_snum && r->level < x->mc_snum)) { + if (unlikely(y->mc_pg[r->level] != x->mc_pg[r->level])) { + mdbx_error("Mismatch cursors's pages at %u level", r->level); + return MDBX_PROBLEM; + } + + int nkeys = page_numkeys(y->mc_pg[r->level]); + assert(nkeys > 0); + if (r->level == 0) + r->root_nkeys = nkeys; + + const int limit_ki = nkeys - 1; + const int x_ki = x->mc_ki[r->level]; + const int y_ki = y->mc_ki[r->level]; + r->diff = ((x_ki < limit_ki) ? x_ki : limit_ki) - + ((y_ki < limit_ki) ? y_ki : limit_ki); + if (r->diff == 0) { + r->level += 1; + continue; + } + + while (unlikely(r->diff == 1) && + likely(r->level + 1 < y->mc_snum && r->level + 1 < x->mc_snum)) { + r->level += 1; + /* DB'PAGEs: 0------------------>MAX + * + * CURSORs: y < x + * STACK[i ]: | + * STACK[+1]: ...y++N|0++x... 
+ */ + nkeys = page_numkeys(y->mc_pg[r->level]); + r->diff = (nkeys - y->mc_ki[r->level]) + x->mc_ki[r->level]; + assert(r->diff > 0); + } + + while (unlikely(r->diff == -1) && + likely(r->level + 1 < y->mc_snum && r->level + 1 < x->mc_snum)) { + r->level += 1; + /* DB'PAGEs: 0------------------>MAX + * + * CURSORs: x < y + * STACK[i ]: | + * STACK[+1]: ...x--N|0--y... + */ + nkeys = page_numkeys(x->mc_pg[r->level]); + r->diff = -(nkeys - x->mc_ki[r->level]) - y->mc_ki[r->level]; + assert(r->diff < 0); + } + + return MDBX_SUCCESS; + } + + r->diff = CMP2INT(x->mc_flags & C_EOF, y->mc_flags & C_EOF); + return MDBX_SUCCESS; +} + +__hot static ptrdiff_t estimate(const MDBX_db *db, + struct diff_result *const __restrict dr) { + /* root: branch-page => scale = leaf-factor * branch-factor^(N-1) + * level-1: branch-page(s) => scale = leaf-factor * branch-factor^2 + * level-2: branch-page(s) => scale = leaf-factor * branch-factor + * level-N: branch-page(s) => scale = leaf-factor + * leaf-level: leaf-page(s) => scale = 1 + */ + ptrdiff_t btree_power = (ptrdiff_t)db->md_depth - 2 - (ptrdiff_t)dr->level; + if (btree_power < 0) + return dr->diff; + + ptrdiff_t estimated = + (ptrdiff_t)db->md_entries * dr->diff / (ptrdiff_t)db->md_leaf_pages; + if (btree_power == 0) + return estimated; + + if (db->md_depth < 4) { + assert(dr->level == 0 && btree_power == 1); + return (ptrdiff_t)db->md_entries * dr->diff / (ptrdiff_t)dr->root_nkeys; + } + + /* average_branchpage_fillfactor = total(branch_entries) / branch_pages + total(branch_entries) = leaf_pages + branch_pages - 1 (root page) */ + const size_t log2_fixedpoint = sizeof(size_t) - 1; + const size_t half = UINT64_C(1) << (log2_fixedpoint - 1); + const size_t factor = + ((db->md_leaf_pages + db->md_branch_pages - 1) << log2_fixedpoint) / + db->md_branch_pages; + while (1) { + switch ((size_t)btree_power) { + default: { + const size_t square = (factor * factor + half) >> log2_fixedpoint; + const size_t quad = (square * square + half) >> log2_fixedpoint; + do { + estimated = estimated * quad + half; + estimated >>= log2_fixedpoint; + btree_power -= 4; + } while (btree_power >= 4); + continue; + } + case 3: + estimated = estimated * factor + half; + estimated >>= log2_fixedpoint; + __fallthrough /* fall through */; + case 2: + estimated = estimated * factor + half; + estimated >>= log2_fixedpoint; + __fallthrough /* fall through */; + case 1: + estimated = estimated * factor + half; + estimated >>= log2_fixedpoint; + __fallthrough /* fall through */; + case 0: + if (unlikely(estimated > (ptrdiff_t)db->md_entries)) + return (ptrdiff_t)db->md_entries; + if (unlikely(estimated < -(ptrdiff_t)db->md_entries)) + return -(ptrdiff_t)db->md_entries; + return estimated; + } + } +} + +int mdbx_estimate_distance(const MDBX_cursor *first, const MDBX_cursor *last, + ptrdiff_t *distance_items) { + if (unlikely(first == NULL || last == NULL || distance_items == NULL)) + return MDBX_EINVAL; + + *distance_items = 0; + struct diff_result dr; + int rc = cursor_diff(last, first, &dr); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(dr.diff == 0) && + F_ISSET(first->mc_db->md_flags & last->mc_db->md_flags, + MDBX_DUPSORT | C_INITIALIZED)) { + first = &first->mc_xcursor->mx_cursor; + last = &last->mc_xcursor->mx_cursor; + rc = cursor_diff(first, last, &dr); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + } + + if (likely(dr.diff != 0)) + *distance_items = estimate(first->mc_db, &dr); + + return MDBX_SUCCESS; +} + +int mdbx_estimate_move(const MDBX_cursor 
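+/* Editorial note: a usage sketch for mdbx_estimate_distance() defined
+ * above. Both cursors must belong to the same transaction and table and
+ * must be positioned; the result is an estimate derived from the b-tree
+ * shape, not an exact count, and comes back negative for an inverted
+ * pair. `first_cursor` and `second_cursor` are assumed to have been
+ * positioned, e.g. via mdbx_cursor_get():
+ *
+ *   ptrdiff_t items = 0;
+ *   int err = mdbx_estimate_distance(first_cursor, second_cursor, &items);
+ *   if (err == MDBX_SUCCESS)
+ *     printf("approximately %td items apart\n", items);
+ */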
*cursor, MDBX_val *key, MDBX_val *data, + MDBX_cursor_op move_op, ptrdiff_t *distance_items) { + if (unlikely(cursor == NULL || distance_items == NULL || + move_op == MDBX_GET_CURRENT || move_op == MDBX_GET_MULTIPLE)) + return MDBX_EINVAL; + + if (unlikely(cursor->mc_signature != MDBX_MC_LIVE)) + return (cursor->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL + : MDBX_EBADSIGN; + + int rc = check_txn(cursor->mc_txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (!(cursor->mc_flags & C_INITIALIZED)) + return MDBX_ENODATA; + + MDBX_cursor_couple next; + cursor_copy(cursor, &next.outer); + if (cursor->mc_db->md_flags & MDBX_DUPSORT) { + next.outer.mc_xcursor = &next.inner; + rc = mdbx_xcursor_init0(&next.outer); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + MDBX_xcursor *mx = &container_of(cursor, MDBX_cursor_couple, outer)->inner; + cursor_copy(&mx->mx_cursor, &next.inner.mx_cursor); + } + + MDBX_val stub = {0, 0}; + if (data == NULL) { + const unsigned mask = + 1 << MDBX_GET_BOTH | 1 << MDBX_GET_BOTH_RANGE | 1 << MDBX_SET_KEY; + if (unlikely(mask & (1 << move_op))) + return MDBX_EINVAL; + data = &stub; + } + + if (key == NULL) { + const unsigned mask = 1 << MDBX_GET_BOTH | 1 << MDBX_GET_BOTH_RANGE | + 1 << MDBX_SET_KEY | 1 << MDBX_SET | + 1 << MDBX_SET_RANGE; + if (unlikely(mask & (1 << move_op))) + return MDBX_EINVAL; + key = &stub; + } + + next.outer.mc_signature = MDBX_MC_LIVE; + rc = mdbx_cursor_get(&next.outer, key, data, move_op); + if (unlikely(rc != MDBX_SUCCESS && + (rc != MDBX_NOTFOUND || !(next.outer.mc_flags & C_INITIALIZED)))) + return rc; + + return mdbx_estimate_distance(cursor, &next.outer, distance_items); +} + +int mdbx_estimate_range(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *begin_key, + MDBX_val *begin_data, MDBX_val *end_key, + MDBX_val *end_data, ptrdiff_t *size_items) { + int rc = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(!size_items)) + return MDBX_EINVAL; + + if (unlikely(begin_data && (begin_key == NULL || begin_key == MDBX_EPSILON))) + return MDBX_EINVAL; + + if (unlikely(end_data && (end_key == NULL || end_key == MDBX_EPSILON))) + return MDBX_EINVAL; + + if (unlikely(begin_key == MDBX_EPSILON && end_key == MDBX_EPSILON)) + return MDBX_EINVAL; + + if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID))) + return MDBX_BAD_DBI; + + MDBX_cursor_couple begin; + /* LY: first, initialize cursor to refresh a DB in case it have DB_STALE */ + rc = mdbx_cursor_init(&begin.outer, txn, dbi); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(begin.outer.mc_db->md_entries == 0)) { + *size_items = 0; + return MDBX_SUCCESS; + } + + if (!begin_key) { + if (unlikely(!end_key)) { + /* LY: FIRST..LAST case */ + *size_items = (ptrdiff_t)begin.outer.mc_db->md_entries; + return MDBX_SUCCESS; + } + MDBX_val stub = {0, 0}; + rc = mdbx_cursor_first(&begin.outer, &stub, &stub); + if (unlikely(end_key == MDBX_EPSILON)) { + /* LY: FIRST..+epsilon case */ + return (rc == MDBX_SUCCESS) + ? mdbx_cursor_count(&begin.outer, (size_t *)size_items) + : rc; + } + } else { + if (unlikely(begin_key == MDBX_EPSILON)) { + if (end_key == NULL) { + /* LY: -epsilon..LAST case */ + MDBX_val stub = {0, 0}; + rc = mdbx_cursor_last(&begin.outer, &stub, &stub); + return (rc == MDBX_SUCCESS) + ? 
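+/* Editorial note: mdbx_estimate_move() above estimates how many items a
+ * given cursor operation would pass over, without actually moving the
+ * cursor. A sketch, assuming `cur` is an initialized, positioned cursor:
+ *
+ *   MDBX_val key, data;
+ *   ptrdiff_t skipped = 0;
+ *   int err = mdbx_estimate_move(cur, &key, &data, MDBX_LAST, &skipped);
+ *   // on success, `skipped` approximates the distance to the last item
+ */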
mdbx_cursor_count(&begin.outer, (size_t *)size_items)
+                   : rc;
+      }
+      /* LY: -epsilon..value case */
+      assert(end_key != MDBX_EPSILON);
+      begin_key = end_key;
+    } else if (unlikely(end_key == MDBX_EPSILON)) {
+      /* LY: value..+epsilon case */
+      assert(begin_key != MDBX_EPSILON);
+      end_key = begin_key;
+    }
+    if (end_key && !begin_data && !end_data &&
+        (begin_key == end_key ||
+         begin.outer.mc_dbx->md_cmp(begin_key, end_key) == 0)) {
+      /* LY: single key case */
+      rc = mdbx_cursor_set(&begin.outer, begin_key, NULL, MDBX_SET).err;
+      if (unlikely(rc != MDBX_SUCCESS)) {
+        *size_items = 0;
+        return (rc == MDBX_NOTFOUND) ? MDBX_SUCCESS : rc;
+      }
+      *size_items = 1;
+      if (begin.outer.mc_xcursor != NULL) {
+        MDBX_node *node = page_node(begin.outer.mc_pg[begin.outer.mc_top],
+                                    begin.outer.mc_ki[begin.outer.mc_top]);
+        if (F_ISSET(node_flags(node), F_DUPDATA)) {
+          /* LY: return the number of duplicates for given key */
+          mdbx_tassert(txn,
+                       begin.outer.mc_xcursor == &begin.inner &&
+                           (begin.inner.mx_cursor.mc_flags & C_INITIALIZED));
+          *size_items =
+              (sizeof(*size_items) >= sizeof(begin.inner.mx_db.md_entries) ||
+               begin.inner.mx_db.md_entries <= PTRDIFF_MAX)
+                  ? (size_t)begin.inner.mx_db.md_entries
+                  : PTRDIFF_MAX;
+        }
+      }
+      return MDBX_SUCCESS;
+    } else {
+      rc = mdbx_cursor_set(&begin.outer, begin_key, begin_data,
+                           begin_data ? MDBX_GET_BOTH_RANGE : MDBX_SET_RANGE)
+               .err;
+    }
+  }
+
+  if (unlikely(rc != MDBX_SUCCESS)) {
+    if (rc != MDBX_NOTFOUND || !(begin.outer.mc_flags & C_INITIALIZED))
+      return rc;
+  }
+
+  MDBX_cursor_couple end;
+  rc = mdbx_cursor_init(&end.outer, txn, dbi);
+  if (unlikely(rc != MDBX_SUCCESS))
+    return rc;
+  if (!end_key) {
+    MDBX_val stub = {0, 0};
+    rc = mdbx_cursor_last(&end.outer, &stub, &stub);
+  } else {
+    rc = mdbx_cursor_set(&end.outer, end_key, end_data,
+                         end_data ? MDBX_GET_BOTH_RANGE : MDBX_SET_RANGE)
+             .err;
+  }
+  if (unlikely(rc != MDBX_SUCCESS)) {
+    if (rc != MDBX_NOTFOUND || !(end.outer.mc_flags & C_INITIALIZED))
+      return rc;
+  }
+
+  rc = mdbx_estimate_distance(&begin.outer, &end.outer, size_items);
+  if (unlikely(rc != MDBX_SUCCESS))
+    return rc;
+  assert(*size_items >= -(ptrdiff_t)begin.outer.mc_db->md_entries &&
+         *size_items <= (ptrdiff_t)begin.outer.mc_db->md_entries);
+
+#if 0 /* LY: It was decided to return the estimation results as-is        \
+       * (i.e. negative) for inverted ranges. */
+
+  /* Commit 8ddfd1f34ad7cf7a3c4aa75d2e248ca7e639ed63
+     Change-Id: If59eccf7311123ab6384c4b93f9b1fed5a0a10d1 */
+
+  if (*size_items < 0) {
+    /* LY: inverted range case */
+    *size_items += (ptrdiff_t)begin.outer.mc_db->md_entries;
+  } else if (*size_items == 0 && begin_key && end_key) {
+    int cmp = begin.outer.mc_dbx->md_cmp(&origin_begin_key, &origin_end_key);
+    if (cmp == 0 && (begin.inner.mx_cursor.mc_flags & C_INITIALIZED) &&
+        begin_data && end_data)
+      cmp = begin.outer.mc_dbx->md_dcmp(&origin_begin_data, &origin_end_data);
+    if (cmp > 0) {
+      /* LY: inverted range case with empty scope */
+      *size_items = (ptrdiff_t)begin.outer.mc_db->md_entries;
+    }
+  }
+  assert(*size_items >= 0 &&
+         *size_items <= (ptrdiff_t)begin.outer.mc_db->md_entries);
+#endif
+
+  return MDBX_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+
+/* Allows updating or deleting an existing record while fetching its
+ * previous data value into old_data. If new_data is NULL the record is
+ * deleted, otherwise it is updated/inserted.
+ *
+ * The current value may reside in an already modified (dirty) page.
+ * In that case the page would be overwritten by the update and the old
+ * value itself lost. Therefore old_data must initially supply an extra
+ * buffer into which the old value can be copied.
+ * If the supplied buffer is too small, the function returns -1 and sets
+ * old_data->iov_len to the required size.
+ *
+ * For non-unique keys a second usage scenario is also possible, in which
+ * old_data selects the particular record (among those sharing the key)
+ * to be deleted/updated. To request this scenario, specify MDBX_CURRENT
+ * and MDBX_NOOVERWRITE together in flags. Exactly this combination was
+ * chosen because it is otherwise meaningless, which makes such a request
+ * unambiguous to identify.
+ *
+ * The function could be superseded by the corresponding cursor
+ * operations after two improvements (TODO):
+ *  - external allocation of cursors, including on the stack (no malloc);
+ *  - querying a page's dirty status by address (awareness of
+ *    MUTABLE/WRITEABLE).
+ */
+
+int mdbx_replace_ex(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key,
+                    MDBX_val *new_data, MDBX_val *old_data,
+                    MDBX_put_flags_t flags, MDBX_preserve_func preserver,
+                    void *preserver_context) {
+  int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED);
+  if (unlikely(rc != MDBX_SUCCESS))
+    return rc;
+
+  if (unlikely(!key || !old_data || old_data == new_data))
+    return MDBX_EINVAL;
+
+  if (unlikely(old_data->iov_base == NULL && old_data->iov_len))
+    return MDBX_EINVAL;
+
+  if (unlikely(new_data == NULL &&
+               (flags & (MDBX_CURRENT | MDBX_RESERVE)) != MDBX_CURRENT))
+    return MDBX_EINVAL;
+
+  if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID)))
+    return MDBX_BAD_DBI;
+
+  if (unlikely(flags &
+               ~(MDBX_NOOVERWRITE | MDBX_NODUPDATA | MDBX_ALLDUPS |
+                 MDBX_RESERVE | MDBX_APPEND | MDBX_APPENDDUP | MDBX_CURRENT)))
+    return MDBX_EINVAL;
+
+  MDBX_cursor_couple cx;
+  rc = mdbx_cursor_init(&cx.outer, txn, dbi);
+  if (unlikely(rc != MDBX_SUCCESS))
+    return rc;
+  cx.outer.mc_next = txn->tw.cursors[dbi];
+  txn->tw.cursors[dbi] = &cx.outer;
+
+  MDBX_val present_key = *key;
+  if (F_ISSET(flags, MDBX_CURRENT | MDBX_NOOVERWRITE)) {
+    /* old_data holds the value selecting the particular duplicate */
+    if (unlikely(!(txn->mt_dbs[dbi].md_flags & MDBX_DUPSORT))) {
+      rc = MDBX_EINVAL;
+      goto bailout;
+    }
+
+    /* drop the extra bit, it served as the marker of the requested mode */
+    flags -= MDBX_NOOVERWRITE;
+
+    rc = mdbx_cursor_get(&cx.outer, &present_key, old_data, MDBX_GET_BOTH);
+    if (rc != MDBX_SUCCESS)
+      goto bailout;
+  } else {
+    /* old_data is a buffer for saving the previous value */
+    if (unlikely(new_data && old_data->iov_base == new_data->iov_base))
+      return MDBX_EINVAL;
+    MDBX_val present_data;
+    rc = mdbx_cursor_get(&cx.outer, &present_key, &present_data, MDBX_SET_KEY);
+    if (unlikely(rc != MDBX_SUCCESS)) {
+      old_data->iov_base = NULL;
+      old_data->iov_len = 0;
+      if (rc != MDBX_NOTFOUND || (flags & MDBX_CURRENT))
+        goto bailout;
+    } else if (flags & MDBX_NOOVERWRITE) {
+      rc = MDBX_KEYEXIST;
+      *old_data = present_data;
+      goto bailout;
+    } else {
+      MDBX_page *page = cx.outer.mc_pg[cx.outer.mc_top];
+      if (txn->mt_dbs[dbi].md_flags & MDBX_DUPSORT) {
+        if (flags & MDBX_CURRENT) {
+          /* disallow update/delete for multi-values */
+          MDBX_node *node = page_node(page, cx.outer.mc_ki[cx.outer.mc_top]);
+          if (F_ISSET(node_flags(node), F_DUPDATA)) {
+            mdbx_tassert(txn, XCURSOR_INITED(&cx.outer) &&
+                                  cx.outer.mc_xcursor->mx_db.md_entries > 1);
+            if (cx.outer.mc_xcursor->mx_db.md_entries > 1) {
+              rc = MDBX_EMULTIVAL;
+              goto bailout;
+            }
+          }
+          /* In the original LMDB the MDBX_CURRENT flag here would
lead to replacing the data without regard to the MDBX_DUPSORT
+           * ordering, but here this is acceptable in any case, since we
+           * have verified that the key has only a single value. */
+        }
+      }
+
+      if (IS_MODIFIABLE(txn, page)) {
+        if (new_data && cmp_lenfast(&present_data, new_data) == 0) {
+          /* if the data matches, nothing needs to be done */
+          *old_data = *new_data;
+          goto bailout;
+        }
+        rc = preserver ? preserver(preserver_context, old_data,
+                                   present_data.iov_base, present_data.iov_len)
+                       : MDBX_SUCCESS;
+        if (unlikely(rc != MDBX_SUCCESS))
+          goto bailout;
+      } else {
+        *old_data = present_data;
+      }
+      flags |= MDBX_CURRENT;
+    }
+  }
+
+  if (likely(new_data))
+    rc = mdbx_cursor_put(&cx.outer, key, new_data, flags);
+  else
+    rc = mdbx_cursor_del(&cx.outer, flags & MDBX_ALLDUPS);
+
+bailout:
+  txn->tw.cursors[dbi] = cx.outer.mc_next;
+  return rc;
+}
+
+static int default_value_preserver(void *context, MDBX_val *target,
+                                   const void *src, size_t bytes) {
+  (void)context;
+  if (unlikely(target->iov_len < bytes)) {
+    target->iov_base = nullptr;
+    target->iov_len = bytes;
+    return MDBX_RESULT_TRUE;
+  }
+  memcpy(target->iov_base, src, target->iov_len = bytes);
+  return MDBX_SUCCESS;
+}
+
+int mdbx_replace(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key,
+                 MDBX_val *new_data, MDBX_val *old_data,
+                 MDBX_put_flags_t flags) {
+  return mdbx_replace_ex(txn, dbi, key, new_data, old_data, flags,
+                         default_value_preserver, nullptr);
+}
+
+/* The function reports whether the given address is located within a
+ * "dirty" page of the given write transaction. Ultimately this makes it
+ * possible to avoid needless copying of data from NOT-dirty pages.
+ *
+ * "Dirty" pages are those that have already been modified during the
+ * write transaction. Consequently, any further changes may overwrite
+ * such pages. Therefore all functions performing modifications must NOT
+ * receive pointers to data in such pages as arguments. In turn,
+ * "NOT-dirty" pages are copied before being modified.
+ *
+ * In other words, data from "dirty" pages must either be copied before
+ * being passed as arguments for further modifications, or be rejected
+ * at the argument-validation stage.
+ *
+ * Thus the function allows both avoiding needless copying and
+ * performing a more thorough validation of the arguments.
+ *
+ * IMPORTANT: The pointer passed in must point to the beginning of the
+ * data. Only then is it guaranteed that the corresponding page header
+ * is physically located within the same memory page, including for
+ * multi-page P_OVERFLOW pages with long data. */
+int mdbx_is_dirty(const MDBX_txn *txn, const void *ptr) {
+  int rc = check_txn(txn, MDBX_TXN_BLOCKED);
+  if (unlikely(rc != MDBX_SUCCESS))
+    return rc;
+
+  const MDBX_env *env = txn->mt_env;
+  const ptrdiff_t offset = (uint8_t *)ptr - env->me_map;
+  if (offset >= 0) {
+    const pgno_t pgno = bytes2pgno(env, offset);
+    if (likely(pgno < txn->mt_next_pgno)) {
+      const MDBX_page *page = pgno2page(env, pgno);
+      if (unlikely(page->mp_pgno != pgno ||
+                   (page->mp_flags & P_ILL_BITS) != 0)) {
+        /* The ptr pointed into the middle of a large page,
+         * not to the beginning of the data. */
+        return MDBX_EINVAL;
+      }
+      return ((txn->mt_flags & MDBX_TXN_RDONLY) || !IS_MODIFIABLE(txn, page))
+                 ? 
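+/* Editorial note: a usage sketch for mdbx_replace() defined above.
+ * old_data supplies a caller-owned buffer; when it is too small, the
+ * default preserver above returns MDBX_RESULT_TRUE after storing the
+ * required size in old_data->iov_len. `txn`, `dbi` and `new_value` are
+ * assumed to exist:
+ *
+ *   char backup[256];
+ *   MDBX_val key = {"counter", 7};
+ *   MDBX_val fresh = {&new_value, sizeof(new_value)};
+ *   MDBX_val old = {backup, sizeof(backup)};
+ *   int err = mdbx_replace(txn, dbi, &key, &fresh, &old, 0);
+ *   // on MDBX_SUCCESS, `old` holds the previous value (if any)
+ */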
MDBX_RESULT_FALSE
+                 : MDBX_RESULT_TRUE;
+    }
+    if ((size_t)offset < env->me_dxb_mmap.limit) {
+      /* The pointer addresses something within the mmap, but beyond the
+       * allocated pages. This can happen if mdbx_is_dirty() is called
+       * after an operation during which a dirty page was returned to the
+       * unallocated space. */
+      return (txn->mt_flags & MDBX_TXN_RDONLY) ? MDBX_EINVAL : MDBX_RESULT_TRUE;
+    }
+  }
+
+  /* The page is outside the used mmap range, i.e. either an invalid
+   * address was passed to the function, or the address is in a shadow
+   * page that was allocated via malloc().
+   *
+   * For the MDBX_WRITE_MAP mode the page is definitely "not dirty",
+   * while for modes without MDBX_WRITE_MAP it is definitely "not clean". */
+  return (txn->mt_flags & (MDBX_WRITEMAP | MDBX_TXN_RDONLY)) ? MDBX_EINVAL
+                                                             : MDBX_RESULT_TRUE;
+}
+
+int mdbx_dbi_sequence(MDBX_txn *txn, MDBX_dbi dbi, uint64_t *result,
+                      uint64_t increment) {
+  int rc = check_txn(txn, MDBX_TXN_BLOCKED);
+  if (unlikely(rc != MDBX_SUCCESS))
+    return rc;
+
+  if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID)))
+    return MDBX_BAD_DBI;
+
+  if (unlikely(txn->mt_dbistate[dbi] & DBI_STALE)) {
+    rc = mdbx_fetch_sdb(txn, dbi);
+    if (unlikely(rc != MDBX_SUCCESS))
+      return rc;
+  }
+
+  MDBX_db *dbs = &txn->mt_dbs[dbi];
+  if (likely(result))
+    *result = dbs->md_seq;
+
+  if (likely(increment > 0)) {
+    if (unlikely(txn->mt_flags & MDBX_TXN_RDONLY))
+      return MDBX_EACCESS;
+
+    uint64_t new = dbs->md_seq + increment;
+    if (unlikely(new < increment))
+      return MDBX_RESULT_TRUE;
+
+    mdbx_tassert(txn, new > dbs->md_seq);
+    dbs->md_seq = new;
+    txn->mt_flags |= MDBX_TXN_DIRTY;
+    txn->mt_dbistate[dbi] |= DBI_DIRTY;
+  }
+
+  return MDBX_SUCCESS;
+}
+
+/*----------------------------------------------------------------------------*/
+
+#ifndef LIBMDBX_NO_EXPORTS_LEGACY_API
+__cold MDBX_NOTHROW_CONST_FUNCTION intptr_t mdbx_limits_pgsize_min(void) {
+  return __inline_mdbx_limits_pgsize_min();
+}
+
+__cold MDBX_NOTHROW_CONST_FUNCTION intptr_t mdbx_limits_pgsize_max(void) {
+  return __inline_mdbx_limits_pgsize_max();
+}
+#endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */
+
+__cold intptr_t mdbx_limits_dbsize_min(intptr_t pagesize) {
+  if (pagesize < 1)
+    pagesize = (intptr_t)mdbx_default_pagesize();
+  else if (unlikely(pagesize < (intptr_t)MIN_PAGESIZE ||
+                    pagesize > (intptr_t)MAX_PAGESIZE ||
+                    !is_powerof2((size_t)pagesize)))
+    return -1;
+
+  return MIN_PAGENO * pagesize;
+}
+
+__cold intptr_t mdbx_limits_dbsize_max(intptr_t pagesize) {
+  if (pagesize < 1)
+    pagesize = (intptr_t)mdbx_default_pagesize();
+  else if (unlikely(pagesize < (intptr_t)MIN_PAGESIZE ||
+                    pagesize > (intptr_t)MAX_PAGESIZE ||
+                    !is_powerof2((size_t)pagesize)))
+    return -1;
+
+  STATIC_ASSERT(MAX_MAPSIZE < INTPTR_MAX);
+  const uint64_t limit = (1 + (uint64_t)MAX_PAGENO) * pagesize;
+  return (limit < (intptr_t)MAX_MAPSIZE) ? (intptr_t)limit
+                                         : (intptr_t)MAX_MAPSIZE;
+}
+
+__cold intptr_t mdbx_limits_txnsize_max(intptr_t pagesize) {
+  if (pagesize < 1)
+    pagesize = (intptr_t)mdbx_default_pagesize();
+  else if (unlikely(pagesize < (intptr_t)MIN_PAGESIZE ||
+                    pagesize > (intptr_t)MAX_PAGESIZE ||
+                    !is_powerof2((size_t)pagesize)))
+    return -1;
+
+  STATIC_ASSERT(MAX_MAPSIZE < INTPTR_MAX);
+  const uint64_t pgl_limit =
+      pagesize * (uint64_t)(MDBX_PGL_LIMIT / 1.6180339887498948482);
+  const uint64_t map_limit = (uint64_t)(MAX_MAPSIZE / 1.6180339887498948482);
+  return (pgl_limit < map_limit) ? 
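+/* Editorial note: mdbx_dbi_sequence() above maintains a persistent
+ * per-table sequence. With increment == 0 it merely reads the current
+ * value (valid in a read-only txn too); *result always receives the
+ * value BEFORE the increment, and MDBX_RESULT_TRUE signals that the
+ * increment would overflow. A sketch, assuming a write txn and open dbi:
+ *
+ *   uint64_t id = 0;
+ *   int err = mdbx_dbi_sequence(txn, dbi, &id, 1); // allocate one id
+ *   if (err == MDBX_RESULT_TRUE)
+ *     ; // increment would overflow; the stored sequence is unchanged
+ */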
(intptr_t)pgl_limit : (intptr_t)map_limit; +} + +/*** Key-making functions to avoid custom comparators *************************/ + +static __always_inline double key2double(const int64_t key) { + union { + uint64_t u; + double f; + } casting; + + casting.u = (key < 0) ? key + UINT64_C(0x8000000000000000) + : UINT64_C(0xffffFFFFffffFFFF) - key; + return casting.f; +} + +static __always_inline uint64_t double2key(const double *const ptr) { + STATIC_ASSERT(sizeof(double) == sizeof(int64_t)); + const int64_t i = *(const int64_t *)ptr; + const uint64_t u = (i < 0) ? UINT64_C(0xffffFFFFffffFFFF) - i + : i + UINT64_C(0x8000000000000000); + if (mdbx_assert_enabled()) { + const double f = key2double(u); + assert(memcmp(&f, ptr, 8) == 0); + } + return u; +} + +static __always_inline float key2float(const int32_t key) { + union { + uint32_t u; + float f; + } casting; + + casting.u = + (key < 0) ? key + UINT32_C(0x80000000) : UINT32_C(0xffffFFFF) - key; + return casting.f; +} + +static __always_inline uint32_t float2key(const float *const ptr) { + STATIC_ASSERT(sizeof(float) == sizeof(int32_t)); + const int32_t i = *(const int32_t *)ptr; + const uint32_t u = + (i < 0) ? UINT32_C(0xffffFFFF) - i : i + UINT32_C(0x80000000); + if (mdbx_assert_enabled()) { + const float f = key2float(u); + assert(memcmp(&f, ptr, 4) == 0); + } + return u; +} + +uint64_t mdbx_key_from_double(const double ieee754_64bit) { + return double2key(&ieee754_64bit); +} + +uint64_t mdbx_key_from_ptrdouble(const double *const ieee754_64bit) { + return double2key(ieee754_64bit); +} + +uint32_t mdbx_key_from_float(const float ieee754_32bit) { + return float2key(&ieee754_32bit); +} + +uint32_t mdbx_key_from_ptrfloat(const float *const ieee754_32bit) { + return float2key(ieee754_32bit); +} + +#ifndef LIBMDBX_NO_EXPORTS_LEGACY_API +MDBX_NOTHROW_CONST_FUNCTION uint64_t mdbx_key_from_int64(const int64_t i64) { + return __inline_mdbx_key_from_int64(i64); +} + +MDBX_NOTHROW_CONST_FUNCTION uint32_t mdbx_key_from_int32(const int32_t i32) { + return __inline_mdbx_key_from_int32(i32); +} +#endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */ + +#define IEEE754_DOUBLE_MANTISSA_SIZE 52 +#define IEEE754_DOUBLE_EXPONENTA_BIAS 0x3FF +#define IEEE754_DOUBLE_EXPONENTA_MAX 0x7FF +#define IEEE754_DOUBLE_IMPLICIT_LEAD UINT64_C(0x0010000000000000) +#define IEEE754_DOUBLE_MANTISSA_MASK UINT64_C(0x000FFFFFFFFFFFFF) +#define IEEE754_DOUBLE_MANTISSA_AMAX UINT64_C(0x001FFFFFFFFFFFFF) + +static __inline int clz64(uint64_t value) { +#if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_clzl) + if (sizeof(value) == sizeof(int)) + return __builtin_clz(value); + if (sizeof(value) == sizeof(long)) + return __builtin_clzl(value); +#if (defined(__SIZEOF_LONG_LONG__) && __SIZEOF_LONG_LONG__ == 8) || \ + __has_builtin(__builtin_clzll) + return __builtin_clzll(value); +#endif /* have(long long) && long long == uint64_t */ +#endif /* GNU C */ + +#if defined(_MSC_VER) + unsigned long index; +#if defined(_M_AMD64) || defined(_M_ARM64) || defined(_M_X64) + _BitScanReverse64(&index, value); + return 63 - index; +#else + if (value > UINT32_MAX) { + _BitScanReverse(&index, (uint32_t)(value >> 32)); + return 31 - index; + } + _BitScanReverse(&index, (uint32_t)value); + return 63 - index; +#endif +#endif /* MSVC */ + + value |= value >> 1; + value |= value >> 2; + value |= value >> 4; + value |= value >> 8; + value |= value >> 16; + value |= value >> 32; + static const uint8_t debruijn_clz64[64] = { + 63, 16, 62, 7, 15, 36, 61, 3, 6, 14, 22, 26, 35, 47, 60, 2, + 9, 5, 28, 11, 13, 21, 42, 19, 25, 
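+/* Editorial note: the key-making helpers above map IEEE-754 values onto
+ * unsigned integers whose natural ordering matches numeric order, so a
+ * table can rely on the built-in unsigned comparator instead of a custom
+ * one. A sketch, assuming `dbi` was opened with MDBX_INTEGERKEY (that
+ * pairing is the presumed intent, per the section title above):
+ *
+ *   const uint64_t k = mdbx_key_from_double(3.14159);
+ *   MDBX_val key = {(void *)&k, sizeof(k)};
+ *   MDBX_val val = {"pi", 2};
+ *   int err = mdbx_put(txn, dbi, &key, &val, MDBX_UPSERT);
+ *   // reverse direction: mdbx_double_from_key(key) yields 3.14159 again
+ */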
31, 34, 40, 46, 52, 59, 1, + 17, 8, 37, 4, 23, 27, 48, 10, 29, 12, 43, 20, 32, 41, 53, 18, + 38, 24, 49, 30, 44, 33, 54, 39, 50, 45, 55, 51, 56, 57, 58, 0}; + return debruijn_clz64[value * UINT64_C(0x03F79D71B4CB0A89) >> 58]; +} + +static __inline uint64_t round_mantissa(const uint64_t u64, int shift) { + assert(shift < 0 && u64 > 0); + shift = -shift; + const unsigned half = 1 << (shift - 1); + const unsigned lsb = 1 & (unsigned)(u64 >> shift); + const unsigned tie2even = 1 ^ lsb; + return (u64 + half - tie2even) >> shift; +} + +uint64_t mdbx_key_from_jsonInteger(const int64_t json_integer) { + const uint64_t bias = UINT64_C(0x8000000000000000); + if (json_integer > 0) { + const uint64_t u64 = json_integer; + int shift = clz64(u64) - (64 - IEEE754_DOUBLE_MANTISSA_SIZE - 1); + uint64_t mantissa = u64 << shift; + if (unlikely(shift < 0)) { + mantissa = round_mantissa(u64, shift); + if (mantissa > IEEE754_DOUBLE_MANTISSA_AMAX) + mantissa = round_mantissa(u64, --shift); + } + + assert(mantissa >= IEEE754_DOUBLE_IMPLICIT_LEAD && + mantissa <= IEEE754_DOUBLE_MANTISSA_AMAX); + const uint64_t exponent = + IEEE754_DOUBLE_EXPONENTA_BIAS + IEEE754_DOUBLE_MANTISSA_SIZE - shift; + assert(exponent > 0 && exponent <= IEEE754_DOUBLE_EXPONENTA_MAX); + const uint64_t key = bias + (exponent << IEEE754_DOUBLE_MANTISSA_SIZE) + + (mantissa - IEEE754_DOUBLE_IMPLICIT_LEAD); +#if !defined(_MSC_VER) || \ + defined( \ + _DEBUG) /* Workaround for MSVC error LNK2019: unresolved external \ + symbol __except1 referenced in function __ftol3_except */ + assert(key == mdbx_key_from_double((double)json_integer)); +#endif /* Workaround for MSVC */ + return key; + } + + if (json_integer < 0) { + const uint64_t u64 = -json_integer; + int shift = clz64(u64) - (64 - IEEE754_DOUBLE_MANTISSA_SIZE - 1); + uint64_t mantissa = u64 << shift; + if (unlikely(shift < 0)) { + mantissa = round_mantissa(u64, shift); + if (mantissa > IEEE754_DOUBLE_MANTISSA_AMAX) + mantissa = round_mantissa(u64, --shift); + } + + assert(mantissa >= IEEE754_DOUBLE_IMPLICIT_LEAD && + mantissa <= IEEE754_DOUBLE_MANTISSA_AMAX); + const uint64_t exponent = + IEEE754_DOUBLE_EXPONENTA_BIAS + IEEE754_DOUBLE_MANTISSA_SIZE - shift; + assert(exponent > 0 && exponent <= IEEE754_DOUBLE_EXPONENTA_MAX); + const uint64_t key = bias - 1 - (exponent << IEEE754_DOUBLE_MANTISSA_SIZE) - + (mantissa - IEEE754_DOUBLE_IMPLICIT_LEAD); +#if !defined(_MSC_VER) || \ + defined( \ + _DEBUG) /* Workaround for MSVC error LNK2019: unresolved external \ + symbol __except1 referenced in function __ftol3_except */ + assert(key == mdbx_key_from_double((double)json_integer)); +#endif /* Workaround for MSVC */ + return key; + } + + return bias; +} + +int64_t mdbx_jsonInteger_from_key(const MDBX_val v) { + assert(v.iov_len == 8); + const uint64_t key = unaligned_peek_u64(2, v.iov_base); + const uint64_t bias = UINT64_C(0x8000000000000000); + const uint64_t covalent = (key > bias) ? key - bias : bias - key - 1; + const int shift = IEEE754_DOUBLE_EXPONENTA_BIAS + 63 - + (IEEE754_DOUBLE_EXPONENTA_MAX & + (int)(covalent >> IEEE754_DOUBLE_MANTISSA_SIZE)); + if (unlikely(shift < 1)) + return (key < bias) ? INT64_MIN : INT64_MAX; + if (unlikely(shift > 63)) + return 0; + + const uint64_t unscaled = ((covalent & IEEE754_DOUBLE_MANTISSA_MASK) + << (63 - IEEE754_DOUBLE_MANTISSA_SIZE)) + + bias; + const int64_t absolute = unscaled >> shift; + const int64_t value = (key < bias) ? 
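+/* Editorial note: mdbx_key_from_jsonInteger() above encodes an int64
+ * into the same ordered key space as mdbx_key_from_double(), so JSON
+ * integers and doubles can share one table and still sort numerically
+ * (the asserts above check exactly this equivalence). Round-trip sketch:
+ *
+ *   const uint64_t key = mdbx_key_from_jsonInteger(INT64_C(-42));
+ *   // key == mdbx_key_from_double(-42.0)
+ *   MDBX_val v = {(void *)&key, sizeof(key)};
+ *   assert(mdbx_jsonInteger_from_key(v) == INT64_C(-42));
+ */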
-absolute : absolute; + assert(key == mdbx_key_from_jsonInteger(value) || + (mdbx_key_from_jsonInteger(value - 1) < key && + key < mdbx_key_from_jsonInteger(value + 1))); + return value; +} + +double mdbx_double_from_key(const MDBX_val v) { + assert(v.iov_len == 8); + return key2double(unaligned_peek_u64(2, v.iov_base)); +} + +float mdbx_float_from_key(const MDBX_val v) { + assert(v.iov_len == 4); + return key2float(unaligned_peek_u32(2, v.iov_base)); +} + +int32_t mdbx_int32_from_key(const MDBX_val v) { + assert(v.iov_len == 4); + return (int32_t)(unaligned_peek_u32(2, v.iov_base) - UINT32_C(0x80000000)); +} + +int64_t mdbx_int64_from_key(const MDBX_val v) { + assert(v.iov_len == 8); + return (int64_t)(unaligned_peek_u64(2, v.iov_base) - + UINT64_C(0x8000000000000000)); +} + +__cold MDBX_cmp_func *mdbx_get_keycmp(unsigned flags) { + return get_default_keycmp(flags); +} + +__cold MDBX_cmp_func *mdbx_get_datacmp(unsigned flags) { + return get_default_datacmp(flags); +} + +__cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, + const uint64_t value) { + int err = check_env(env, false); + if (unlikely(err != MDBX_SUCCESS)) + return err; + + const bool lock_needed = ((env->me_flags & MDBX_ENV_ACTIVE) && env->me_txn0 && + env->me_txn0->mt_owner != mdbx_thread_self()); + bool should_unlock = false; + switch (option) { + case MDBX_opt_sync_bytes: + if (unlikely(env->me_flags & MDBX_RDONLY)) + return MDBX_EACCESS; + if (unlikely(!(env->me_flags & MDBX_ENV_ACTIVE))) + return MDBX_EPERM; + if (sizeof(value) > sizeof(size_t) && unlikely(value != (size_t)value)) + return MDBX_TOO_LARGE; + if (atomic_store32(&env->me_lck->mti_autosync_threshold, + bytes2pgno(env, (size_t)value + env->me_psize - 1), + mo_Relaxed) != 0 && + (env->me_flags & MDBX_ENV_ACTIVE)) { + err = mdbx_env_sync_poll(env); + if (unlikely(MDBX_IS_ERROR(err))) + return err; + } + break; + + case MDBX_opt_sync_period: + if (unlikely(env->me_flags & MDBX_RDONLY)) + return MDBX_EACCESS; + if (unlikely(!(env->me_flags & MDBX_ENV_ACTIVE))) + return MDBX_EPERM; + if (unlikely(value > UINT32_MAX)) + return MDBX_TOO_LARGE; + if (atomic_store64(&env->me_lck->mti_autosync_period, + mdbx_osal_16dot16_to_monotime((uint32_t)value), + mo_Relaxed) != 0 && + (env->me_flags & MDBX_ENV_ACTIVE)) { + err = mdbx_env_sync_poll(env); + if (unlikely(MDBX_IS_ERROR(err))) + return err; + } + break; + + case MDBX_opt_max_db: + if (unlikely(value > MDBX_MAX_DBI)) + return MDBX_EINVAL; + if (unlikely(env->me_map)) + return MDBX_EPERM; + env->me_maxdbs = (unsigned)value + CORE_DBS; + break; + + case MDBX_opt_max_readers: + if (unlikely(value < 1 || value > MDBX_READERS_LIMIT)) + return MDBX_EINVAL; + if (unlikely(env->me_map)) + return MDBX_EPERM; + env->me_maxreaders = (unsigned)value; + break; + + case MDBX_opt_dp_reserve_limit: + if (unlikely(value > INT_MAX)) + return MDBX_EINVAL; + if (env->me_options.dp_reserve_limit != (unsigned)value) { + if (lock_needed) { + err = mdbx_txn_lock(env, false); + if (unlikely(err != MDBX_SUCCESS)) + return err; + should_unlock = true; + } + env->me_options.dp_reserve_limit = (unsigned)value; + while (env->me_dp_reserve_len > env->me_options.dp_reserve_limit) { + mdbx_assert(env, env->me_dp_reserve != NULL); + MDBX_page *dp = env->me_dp_reserve; + ASAN_UNPOISON_MEMORY_REGION(dp, env->me_psize); + VALGRIND_MAKE_MEM_DEFINED(&dp->mp_next, sizeof(dp->mp_next)); + env->me_dp_reserve = dp->mp_next; + VALGRIND_MEMPOOL_FREE(env, dp); + mdbx_free(dp); + env->me_dp_reserve_len -= 1; + } + } + break; + + case 
MDBX_opt_rp_augment_limit: + if (unlikely(value > MDBX_PGL_LIMIT)) + return MDBX_EINVAL; + env->me_options.rp_augment_limit = (unsigned)value; + break; + + case MDBX_opt_txn_dp_limit: + case MDBX_opt_txn_dp_initial: + if (unlikely(value > MDBX_PGL_LIMIT || value < CURSOR_STACK * 4)) + return MDBX_EINVAL; + if (unlikely(env->me_flags & MDBX_RDONLY)) + return MDBX_EACCESS; + if (lock_needed) { + err = mdbx_txn_lock(env, false); + if (unlikely(err != MDBX_SUCCESS)) + return err; + should_unlock = true; + } + if (env->me_txn) + err = MDBX_EPERM /* unable change during transaction */; + else { + const pgno_t value32 = (pgno_t)value; + if (option == MDBX_opt_txn_dp_initial && + env->me_options.dp_initial != value32) { + env->me_options.dp_initial = value32; + if (env->me_options.dp_limit < value32) { + env->me_options.dp_limit = value32; + env->me_options.flags.non_auto.dp_limit = 1; + } + } + if (option == MDBX_opt_txn_dp_limit && + env->me_options.dp_limit != value32) { + env->me_options.dp_limit = value32; + env->me_options.flags.non_auto.dp_limit = 1; + if (env->me_options.dp_initial > value32) + env->me_options.dp_initial = value32; + } + } + break; + + case MDBX_opt_spill_max_denominator: + if (unlikely(value > 255)) + return MDBX_EINVAL; + env->me_options.spill_max_denominator = (uint8_t)value; + break; + case MDBX_opt_spill_min_denominator: + if (unlikely(value > 255)) + return MDBX_EINVAL; + env->me_options.spill_min_denominator = (uint8_t)value; + break; + case MDBX_opt_spill_parent4child_denominator: + if (unlikely(value > 255)) + return MDBX_EINVAL; + env->me_options.spill_parent4child_denominator = (uint8_t)value; + break; + + case MDBX_opt_loose_limit: + if (unlikely(value > 255)) + return MDBX_EINVAL; + env->me_options.dp_loose_limit = (uint8_t)value; + break; + + case MDBX_opt_merge_threshold_16dot16_percent: + if (unlikely(value < 8192 || value > 32768)) + return MDBX_EINVAL; + env->me_options.merge_threshold_16dot16_percent = (unsigned)value; + recalculate_merge_threshold(env); + break; + + default: + return MDBX_EINVAL; + } + + if (should_unlock) + mdbx_txn_unlock(env); + return err; +} + +__cold int mdbx_env_get_option(const MDBX_env *env, const MDBX_option_t option, + uint64_t *pvalue) { + int err = check_env(env, false); + if (unlikely(err != MDBX_SUCCESS)) + return err; + if (unlikely(!pvalue)) + return MDBX_EINVAL; + + switch (option) { + case MDBX_opt_sync_bytes: + if (unlikely(!(env->me_flags & MDBX_ENV_ACTIVE))) + return MDBX_EPERM; + *pvalue = pgno2bytes( + env, atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed)); + break; + + case MDBX_opt_sync_period: + if (unlikely(!(env->me_flags & MDBX_ENV_ACTIVE))) + return MDBX_EPERM; + *pvalue = mdbx_osal_monotime_to_16dot16( + atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed)); + break; + + case MDBX_opt_max_db: + *pvalue = env->me_maxdbs - CORE_DBS; + break; + + case MDBX_opt_max_readers: + *pvalue = env->me_maxreaders; + break; + + case MDBX_opt_dp_reserve_limit: + *pvalue = env->me_options.dp_reserve_limit; + break; + + case MDBX_opt_rp_augment_limit: + *pvalue = env->me_options.rp_augment_limit; + break; + + case MDBX_opt_txn_dp_limit: + *pvalue = env->me_options.dp_limit; + break; + case MDBX_opt_txn_dp_initial: + *pvalue = env->me_options.dp_initial; + break; + + case MDBX_opt_spill_max_denominator: + *pvalue = env->me_options.spill_max_denominator; + break; + case MDBX_opt_spill_min_denominator: + *pvalue = env->me_options.spill_min_denominator; + break; + case 
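+/* Editorial note: a sketch of the option API in this section. Values are
+ * plain integers; as the checks above show, some options (for example
+ * MDBX_opt_max_db and MDBX_opt_max_readers) may only be changed before
+ * the environment is opened:
+ *
+ *   MDBX_env *env = NULL;
+ *   int err = mdbx_env_create(&env);
+ *   if (err == MDBX_SUCCESS)
+ *     err = mdbx_env_set_option(env, MDBX_opt_max_db, 42);
+ *   uint64_t confirmed = 0;
+ *   if (err == MDBX_SUCCESS)
+ *     err = mdbx_env_get_option(env, MDBX_opt_max_db, &confirmed);
+ */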
MDBX_opt_spill_parent4child_denominator: + *pvalue = env->me_options.spill_parent4child_denominator; + break; + + case MDBX_opt_loose_limit: + *pvalue = env->me_options.dp_loose_limit; + break; + + case MDBX_opt_merge_threshold_16dot16_percent: + *pvalue = env->me_options.merge_threshold_16dot16_percent; + break; + + default: + return MDBX_EINVAL; + } + + return MDBX_SUCCESS; +} + +/*** Attribute support functions for Nexenta **********************************/ +#ifdef MDBX_NEXENTA_ATTRS + +static __inline int mdbx_attr_peek(MDBX_val *data, mdbx_attr_t *attrptr) { + if (unlikely(data->iov_len < sizeof(mdbx_attr_t))) + return MDBX_INCOMPATIBLE; + + if (likely(attrptr != NULL)) + *attrptr = *(mdbx_attr_t *)data->iov_base; + data->iov_len -= sizeof(mdbx_attr_t); + data->iov_base = + likely(data->iov_len > 0) ? ((mdbx_attr_t *)data->iov_base) + 1 : NULL; + + return MDBX_SUCCESS; +} + +static __inline int mdbx_attr_poke(MDBX_val *reserved, MDBX_val *data, + mdbx_attr_t attr, MDBX_put_flags_t flags) { + mdbx_attr_t *space = reserved->iov_base; + if (flags & MDBX_RESERVE) { + if (likely(data != NULL)) { + data->iov_base = data->iov_len ? space + 1 : NULL; + } + } else { + *space = attr; + if (likely(data != NULL)) { + memcpy(space + 1, data->iov_base, data->iov_len); + } + } + + return MDBX_SUCCESS; +} + +int mdbx_cursor_get_attr(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, + mdbx_attr_t *attrptr, MDBX_cursor_op op) { + int rc = mdbx_cursor_get(mc, key, data, op); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + return mdbx_attr_peek(data, attrptr); +} + +int mdbx_get_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, + uint64_t *attrptr) { + int rc = mdbx_get(txn, dbi, key, data); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + return mdbx_attr_peek(data, attrptr); +} + +int mdbx_put_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, + mdbx_attr_t attr, MDBX_put_flags_t flags) { + MDBX_val reserve; + reserve.iov_base = NULL; + reserve.iov_len = (data ? data->iov_len : 0) + sizeof(mdbx_attr_t); + + int rc = mdbx_put(txn, dbi, key, &reserve, flags | MDBX_RESERVE); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + return mdbx_attr_poke(&reserve, data, attr, flags); +} + +int mdbx_cursor_put_attr(MDBX_cursor *cursor, MDBX_val *key, MDBX_val *data, + mdbx_attr_t attr, MDBX_put_flags_t flags) { + MDBX_val reserve; + reserve.iov_base = NULL; + reserve.iov_len = (data ? data->iov_len : 0) + sizeof(mdbx_attr_t); + + int rc = mdbx_cursor_put(cursor, key, &reserve, flags | MDBX_RESERVE); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + return mdbx_attr_poke(&reserve, data, attr, flags); +} + +int mdbx_set_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, + mdbx_attr_t attr) { + if (unlikely(!key || !txn)) + return MDBX_EINVAL; + + if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDBX_EBADSIGN; + + if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) + return MDBX_BAD_DBI; + + if (unlikely(txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED))) + return (txn->mt_flags & MDBX_TXN_RDONLY) ? 
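+/* Editorial note: the Nexenta attribute API (compiled only when
+ * MDBX_NEXENTA_ATTRS is defined) transparently prepends an mdbx_attr_t
+ * to every stored value, as mdbx_attr_poke()/mdbx_attr_peek() above
+ * show. A sketch, assuming a write txn and an open dbi:
+ *
+ *   MDBX_val key = {"k", 1}, data = {"v", 1};
+ *   int err = mdbx_put_attr(txn, dbi, &key, &data, 12345, 0);
+ *   uint64_t attr = 0;
+ *   if (err == MDBX_SUCCESS)
+ *     err = mdbx_get_attr(txn, dbi, &key, &data, &attr); // attr == 12345
+ */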
MDBX_EACCESS : MDBX_BAD_TXN; + + MDBX_cursor_couple cx; + MDBX_val old_data; + int rc = mdbx_cursor_init(&cx.outer, txn, dbi); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + rc = mdbx_cursor_set(&cx.outer, key, &old_data, MDBX_SET, NULL); + if (unlikely(rc != MDBX_SUCCESS)) { + if (rc == MDBX_NOTFOUND && data) { + cx.outer.mc_next = txn->tw.cursors[dbi]; + txn->tw.cursors[dbi] = &cx.outer; + rc = mdbx_cursor_put_attr(&cx.outer, key, data, attr, 0); + txn->tw.cursors[dbi] = cx.outer.mc_next; + } + return rc; + } + + mdbx_attr_t old_attr = 0; + rc = mdbx_attr_peek(&old_data, &old_attr); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (old_attr == attr && (!data || (data->iov_len == old_data.iov_len && + memcmp(data->iov_base, old_data.iov_base, + old_data.iov_len) == 0))) + return MDBX_SUCCESS; + + cx.outer.mc_next = txn->tw.cursors[dbi]; + txn->tw.cursors[dbi] = &cx.outer; + rc = mdbx_cursor_put_attr(&cx.outer, key, data ? data : &old_data, attr, + MDBX_CURRENT); + txn->tw.cursors[dbi] = cx.outer.mc_next; + return rc; +} +#endif /* MDBX_NEXENTA_ATTRS */ + +/******************************************************************************/ +/* *INDENT-OFF* */ +/* clang-format off */ + +__dll_export +#ifdef __attribute_used__ + __attribute_used__ +#elif defined(__GNUC__) || __has_attribute(__used__) + __attribute__((__used__)) +#endif +#ifdef __attribute_externally_visible__ + __attribute_externally_visible__ +#elif (defined(__GNUC__) && !defined(__clang__)) || \ + __has_attribute(__externally_visible__) + __attribute__((__externally_visible__)) +#endif + const struct MDBX_build_info mdbx_build = { +#ifdef MDBX_BUILD_TIMESTAMP + MDBX_BUILD_TIMESTAMP +#else + "\"" __DATE__ " " __TIME__ "\"" +#endif /* MDBX_BUILD_TIMESTAMP */ + + , +#ifdef MDBX_BUILD_TARGET + MDBX_BUILD_TARGET +#else + #if defined(__ANDROID_API__) + "Android" STRINGIFY(__ANDROID_API__) + #elif defined(__linux__) || defined(__gnu_linux__) + "Linux" + #elif defined(EMSCRIPTEN) || defined(__EMSCRIPTEN__) + "webassembly" + #elif defined(__CYGWIN__) + "CYGWIN" + #elif defined(_WIN64) || defined(_WIN32) || defined(__TOS_WIN__) \ + || defined(__WINDOWS__) + "Windows" + #elif defined(__APPLE__) + #if (defined(TARGET_OS_IPHONE) && TARGET_OS_IPHONE) \ + || (defined(TARGET_IPHONE_SIMULATOR) && TARGET_IPHONE_SIMULATOR) + "iOS" + #else + "MacOS" + #endif + #elif defined(__FreeBSD__) + "FreeBSD" + #elif defined(__DragonFly__) + "DragonFlyBSD" + #elif defined(__NetBSD__) + "NetBSD" + #elif defined(__OpenBSD__) + "OpenBSD" + #elif defined(__bsdi__) + "UnixBSDI" + #elif defined(__MACH__) + "MACH" + #elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) + "HPUX" + #elif defined(_AIX) + "AIX" + #elif defined(__sun) && defined(__SVR4) + "Solaris" + #elif defined(__BSD__) || defined(BSD) + "UnixBSD" + #elif defined(__unix__) || defined(UNIX) || defined(__unix) \ + || defined(__UNIX) || defined(__UNIX__) + "UNIX" + #elif defined(_POSIX_VERSION) + "POSIX" STRINGIFY(_POSIX_VERSION) + #else + "UnknownOS" + #endif /* Target OS */ + + "-" + + #if defined(__amd64__) + "AMD64" + #elif defined(__ia32__) + "IA32" + #elif defined(__e2k__) || defined(__elbrus__) + "Elbrus" + #elif defined(__alpha__) || defined(__alpha) || defined(_M_ALPHA) + "Alpha" + #elif defined(__aarch64__) || defined(_M_ARM64) + "ARM64" + #elif defined(__arm__) || defined(__thumb__) || defined(__TARGET_ARCH_ARM) \ + || defined(__TARGET_ARCH_THUMB) || defined(_ARM) || defined(_M_ARM) \ + || defined(_M_ARMT) || defined(__arm) + "ARM" + #elif defined(__mips64) || 
defined(__mips64__) || (defined(__mips) && (__mips >= 64)) + "MIPS64" + #elif defined(__mips__) || defined(__mips) || defined(_R4000) || defined(__MIPS__) + "MIPS" + #elif defined(__hppa64__) || defined(__HPPA64__) || defined(__hppa64) + "PARISC64" + #elif defined(__hppa__) || defined(__HPPA__) || defined(__hppa) + "PARISC" + #elif defined(__ia64__) || defined(__ia64) || defined(_IA64) \ + || defined(__IA64__) || defined(_M_IA64) || defined(__itanium__) + "Itanium" + #elif defined(__powerpc64__) || defined(__ppc64__) || defined(__ppc64) \ + || defined(__powerpc64) || defined(_ARCH_PPC64) + "PowerPC64" + #elif defined(__powerpc__) || defined(__ppc__) || defined(__powerpc) \ + || defined(__ppc) || defined(_ARCH_PPC) || defined(__PPC__) || defined(__POWERPC__) + "PowerPC" + #elif defined(__sparc64__) || defined(__sparc64) + "SPARC64" + #elif defined(__sparc__) || defined(__sparc) + "SPARC" + #elif defined(__s390__) || defined(__s390) || defined(__zarch__) || defined(__zarch) + "S390" + #else + "UnknownARCH" + #endif +#endif /* MDBX_BUILD_TARGET */ + +#ifdef MDBX_BUILD_TYPE +# if defined(_MSC_VER) +# pragma message("Configuration-depended MDBX_BUILD_TYPE: " MDBX_BUILD_TYPE) +# endif + "-" MDBX_BUILD_TYPE +#endif /* MDBX_BUILD_TYPE */ + , + "MDBX_DEBUG=" STRINGIFY(MDBX_DEBUG) + " MDBX_WORDBITS=" STRINGIFY(MDBX_WORDBITS) + " BYTE_ORDER=" +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + "LITTLE_ENDIAN" +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + "BIG_ENDIAN" +#else + #error "FIXME: Unsupported byte order" +#endif /* __BYTE_ORDER__ */ + " MDBX_ENV_CHECKPID=" MDBX_ENV_CHECKPID_CONFIG + " MDBX_TXN_CHECKOWNER=" MDBX_TXN_CHECKOWNER_CONFIG + " MDBX_64BIT_ATOMIC=" MDBX_64BIT_ATOMIC_CONFIG + " MDBX_64BIT_CAS=" MDBX_64BIT_CAS_CONFIG + " MDBX_TRUST_RTC=" MDBX_TRUST_RTC_CONFIG + " MDBX_ENABLE_REFUND=" STRINGIFY(MDBX_ENABLE_REFUND) + " MDBX_ENABLE_MADVISE=" STRINGIFY(MDBX_ENABLE_MADVISE) +#if MDBX_DISABLE_PAGECHECKS + " MDBX_DISABLE_PAGECHECKS=YES" +#endif /* MDBX_DISABLE_PAGECHECKS */ +#ifdef __SANITIZE_ADDRESS__ + " SANITIZE_ADDRESS=YES" +#endif /* __SANITIZE_ADDRESS__ */ +#ifdef MDBX_USE_VALGRIND + " MDBX_USE_VALGRIND=YES" +#endif /* MDBX_USE_VALGRIND */ +#if MDBX_FORCE_ASSERTIONS + " MDBX_FORCE_ASSERTIONS=YES" +#endif /* MDBX_FORCE_ASSERTIONS */ +#ifdef _GNU_SOURCE + " _GNU_SOURCE=YES" +#else + " _GNU_SOURCE=NO" +#endif /* _GNU_SOURCE */ +#ifdef __APPLE__ + " MDBX_OSX_SPEED_INSTEADOF_DURABILITY=" STRINGIFY(MDBX_OSX_SPEED_INSTEADOF_DURABILITY) +#endif /* MacOS */ +#if defined(_WIN32) || defined(_WIN64) + " MDBX_WITHOUT_MSVC_CRT=" STRINGIFY(MDBX_AVOID_CRT) + " MDBX_BUILD_SHARED_LIBRARY=" STRINGIFY(MDBX_BUILD_SHARED_LIBRARY) +#if !MDBX_BUILD_SHARED_LIBRARY + " MDBX_MANUAL_MODULE_HANDLER=" STRINGIFY(MDBX_MANUAL_MODULE_HANDLER) +#endif + " WINVER=" STRINGIFY(WINVER) +#else /* Windows */ + " MDBX_LOCKING=" MDBX_LOCKING_CONFIG + " MDBX_USE_OFDLOCKS=" MDBX_USE_OFDLOCKS_CONFIG +#endif /* !Windows */ + " MDBX_CACHELINE_SIZE=" STRINGIFY(MDBX_CACHELINE_SIZE) + " MDBX_CPU_WRITEBACK_INCOHERENT=" STRINGIFY(MDBX_CPU_WRITEBACK_INCOHERENT) + " MDBX_MMAP_INCOHERENT_CPU_CACHE=" STRINGIFY(MDBX_MMAP_INCOHERENT_CPU_CACHE) + " MDBX_MMAP_INCOHERENT_FILE_WRITE=" STRINGIFY(MDBX_MMAP_INCOHERENT_FILE_WRITE) + " MDBX_UNALIGNED_OK=" STRINGIFY(MDBX_UNALIGNED_OK) + " MDBX_PNL_ASCENDING=" STRINGIFY(MDBX_PNL_ASCENDING) + , +#ifdef MDBX_BUILD_COMPILER + MDBX_BUILD_COMPILER +#else + #ifdef __INTEL_COMPILER + "Intel C/C++ " STRINGIFY(__INTEL_COMPILER) + #elif defined(__apple_build_version__) + "Apple clang " 
STRINGIFY(__apple_build_version__)
+  #elif defined(__ibmxl__)
+    "IBM clang C " STRINGIFY(__ibmxl_version__) "." STRINGIFY(__ibmxl_release__)
+    "." STRINGIFY(__ibmxl_modification__) "." STRINGIFY(__ibmxl_ptf_fix_level__)
+  #elif defined(__clang__)
+    "clang " STRINGIFY(__clang_version__)
+  #elif defined(__MINGW64__)
+    "MINGW-64 " STRINGIFY(__MINGW64_MAJOR_VERSION) "." STRINGIFY(__MINGW64_MINOR_VERSION)
+  #elif defined(__MINGW32__)
+    "MINGW-32 " STRINGIFY(__MINGW32_MAJOR_VERSION) "." STRINGIFY(__MINGW32_MINOR_VERSION)
+  #elif defined(__IBMC__)
+    "IBM C " STRINGIFY(__IBMC__)
+  #elif defined(__GNUC__)
+    "GNU C/C++ "
+    #ifdef __VERSION__
+      __VERSION__
+    #else
+      STRINGIFY(__GNUC__) "." STRINGIFY(__GNUC_MINOR__) "." STRINGIFY(__GNUC_PATCHLEVEL__)
+    #endif
+  #elif defined(_MSC_VER)
+    "MSVC " STRINGIFY(_MSC_FULL_VER) "-" STRINGIFY(_MSC_BUILD)
+  #else
+    "Unknown compiler"
+  #endif
+#endif /* MDBX_BUILD_COMPILER */
+  ,
+#ifdef MDBX_BUILD_FLAGS_CONFIG
+  MDBX_BUILD_FLAGS_CONFIG
+#endif /* MDBX_BUILD_FLAGS_CONFIG */
+#ifdef MDBX_BUILD_FLAGS
+  MDBX_BUILD_FLAGS
+#endif /* MDBX_BUILD_FLAGS */
+#if !(defined(MDBX_BUILD_FLAGS_CONFIG) || defined(MDBX_BUILD_FLAGS))
+  "undefined (please use correct build script)"
+#ifdef _MSC_VER
+#pragma message("warning: Build flags undefined. Please use correct build script")
+#else
+#warning "Build flags undefined. Please use correct build script"
+#endif // _MSC_VER
+#endif
+};
+
+#ifdef __SANITIZE_ADDRESS__
+LIBMDBX_API __attribute__((__weak__)) const char *__asan_default_options() {
+  return "symbolize=1:allow_addr2line=1:"
+#ifdef _DEBUG
+         "debug=1:"
+#endif /* _DEBUG */
+         "report_globals=1:"
+         "replace_str=1:replace_intrin=1:"
+         "malloc_context_size=9:"
+         "detect_leaks=1:"
+         "check_printf=1:"
+         "detect_deadlocks=1:"
+#ifndef LTO_ENABLED
+         "check_initialization_order=1:"
+#endif
+         "detect_stack_use_after_return=1:"
+         "intercept_tls_get_addr=1:"
+         "decorate_proc_maps=1:"
+         "abort_on_error=1";
+}
+#endif /* __SANITIZE_ADDRESS__ */
+
+/* *INDENT-ON* */
+/* clang-format on */
+/* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */
+
+/*
+ * Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru>
+ * and other libmdbx authors: please see AUTHORS file.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted only as authorized by the OpenLDAP
+ * Public License.
+ *
+ * A copy of this license is available in the file LICENSE in the
+ * top-level directory of the distribution or, alternatively, at
+ * <http://www.OpenLDAP.org/license.html>.
+ */
+
+
+#if defined(_WIN32) || defined(_WIN64)
+
+#include <winioctl.h>
+
+static int waitstatus2errcode(DWORD result) {
+  switch (result) {
+  case WAIT_OBJECT_0:
+    return MDBX_SUCCESS;
+  case WAIT_FAILED:
+    return (int)GetLastError();
+  case WAIT_ABANDONED:
+    return ERROR_ABANDONED_WAIT_0;
+  case WAIT_IO_COMPLETION:
+    return ERROR_USER_APC;
+  case WAIT_TIMEOUT:
+    return ERROR_TIMEOUT;
+  default:
+    return ERROR_UNHANDLED_ERROR;
+  }
+}
+
+/* Map a result from an NTAPI call to WIN32 error code. */
+static int ntstatus2errcode(NTSTATUS status) {
+  DWORD dummy;
+  OVERLAPPED ov;
+  memset(&ov, 0, sizeof(ov));
+  ov.Internal = status;
+  return GetOverlappedResult(NULL, &ov, &dummy, FALSE) ? MDBX_SUCCESS
+                                                       : (int)GetLastError();
+}
+
+/* We use native NT APIs to setup the memory map, so that we can
+ * let the DB file grow incrementally instead of always preallocating
+ * the full size. 
These APIs are defined in <wdm.h> and <ntifs.h>
+ * but those headers are meant for driver-level development and
+ * conflict with the regular user-level headers, so we explicitly
+ * declare them here. Using these APIs also means we must link to
+ * ntdll.dll, which is not linked by default in user code. */
+
+extern NTSTATUS NTAPI NtCreateSection(
+    OUT PHANDLE SectionHandle, IN ACCESS_MASK DesiredAccess,
+    IN OPTIONAL POBJECT_ATTRIBUTES ObjectAttributes,
+    IN OPTIONAL PLARGE_INTEGER MaximumSize, IN ULONG SectionPageProtection,
+    IN ULONG AllocationAttributes, IN OPTIONAL HANDLE FileHandle);
+
+typedef struct _SECTION_BASIC_INFORMATION {
+  ULONG Unknown;
+  ULONG SectionAttributes;
+  LARGE_INTEGER SectionSize;
+} SECTION_BASIC_INFORMATION, *PSECTION_BASIC_INFORMATION;
+
+extern NTSTATUS NTAPI NtMapViewOfSection(
+    IN HANDLE SectionHandle, IN HANDLE ProcessHandle, IN OUT PVOID *BaseAddress,
+    IN ULONG_PTR ZeroBits, IN SIZE_T CommitSize,
+    IN OUT OPTIONAL PLARGE_INTEGER SectionOffset, IN OUT PSIZE_T ViewSize,
+    IN SECTION_INHERIT InheritDisposition, IN ULONG AllocationType,
+    IN ULONG Win32Protect);
+
+extern NTSTATUS NTAPI NtUnmapViewOfSection(IN HANDLE ProcessHandle,
+                                           IN OPTIONAL PVOID BaseAddress);
+
+extern NTSTATUS NTAPI NtClose(HANDLE Handle);
+
+extern NTSTATUS NTAPI NtAllocateVirtualMemory(
+    IN HANDLE ProcessHandle, IN OUT PVOID *BaseAddress, IN ULONG_PTR ZeroBits,
+    IN OUT PSIZE_T RegionSize, IN ULONG AllocationType, IN ULONG Protect);
+
+extern NTSTATUS NTAPI NtFreeVirtualMemory(IN HANDLE ProcessHandle,
+                                          IN PVOID *BaseAddress,
+                                          IN OUT PSIZE_T RegionSize,
+                                          IN ULONG FreeType);
+
+#ifndef WOF_CURRENT_VERSION
+typedef struct _WOF_EXTERNAL_INFO {
+  DWORD Version;
+  DWORD Provider;
+} WOF_EXTERNAL_INFO, *PWOF_EXTERNAL_INFO;
+#endif /* WOF_CURRENT_VERSION */
+
+#ifndef WIM_PROVIDER_CURRENT_VERSION
+#define WIM_PROVIDER_HASH_SIZE 20
+
+typedef struct _WIM_PROVIDER_EXTERNAL_INFO {
+  DWORD Version;
+  DWORD Flags;
+  LARGE_INTEGER DataSourceId;
+  BYTE ResourceHash[WIM_PROVIDER_HASH_SIZE];
+} WIM_PROVIDER_EXTERNAL_INFO, *PWIM_PROVIDER_EXTERNAL_INFO;
+#endif /* WIM_PROVIDER_CURRENT_VERSION */
+
+#ifndef FILE_PROVIDER_CURRENT_VERSION
+typedef struct _FILE_PROVIDER_EXTERNAL_INFO_V1 {
+  ULONG Version;
+  ULONG Algorithm;
+  ULONG Flags;
+} FILE_PROVIDER_EXTERNAL_INFO_V1, *PFILE_PROVIDER_EXTERNAL_INFO_V1;
+#endif /* FILE_PROVIDER_CURRENT_VERSION */
+
+#ifndef STATUS_OBJECT_NOT_EXTERNALLY_BACKED
+#define STATUS_OBJECT_NOT_EXTERNALLY_BACKED ((NTSTATUS)0xC000046DL)
+#endif
+#ifndef STATUS_INVALID_DEVICE_REQUEST
+#define STATUS_INVALID_DEVICE_REQUEST ((NTSTATUS)0xC0000010L)
+#endif
+#ifndef STATUS_NOT_SUPPORTED
+#define STATUS_NOT_SUPPORTED ((NTSTATUS)0xC00000BBL)
+#endif
+
+#ifndef FILE_DEVICE_FILE_SYSTEM
+#define FILE_DEVICE_FILE_SYSTEM 0x00000009
+#endif
+
+#ifndef FSCTL_GET_EXTERNAL_BACKING
+#define FSCTL_GET_EXTERNAL_BACKING                                             \
+  CTL_CODE(FILE_DEVICE_FILE_SYSTEM, 196, METHOD_BUFFERED, FILE_ANY_ACCESS)
+#endif
+
+#ifndef ERROR_NOT_CAPABLE
+#define ERROR_NOT_CAPABLE 775L
+#endif
+
+#endif /* _WIN32 || _WIN64 */
+
+/*----------------------------------------------------------------------------*/
+
+#if defined(__UCLIBC__)
+__extern_C void __assert(const char *, const char *, unsigned int, const char *)
+#ifdef __THROW
+    __THROW
+#else
+    __nothrow
+#endif /* __THROW */
+    MDBX_NORETURN;
+#define __assert_fail(assertion, file, line, function)                        \
+  __assert(assertion, file, line, function)
+
+#elif _POSIX_C_SOURCE > 200212 &&                                             \
+    /* workaround to avoid musl libc wrong prototype */ (                     \
+        defined(__GLIBC__) || 
defined(__GNU_LIBRARY__)) +/* Prototype should match libc runtime. ISO POSIX (2003) & LSB 1.x-3.x */ +__extern_C void __assert_fail(const char *assertion, const char *file, + unsigned line, const char *function) +#ifdef __THROW + __THROW +#else + __nothrow +#endif /* __THROW */ + MDBX_NORETURN; + +#elif defined(__APPLE__) || defined(__MACH__) +__extern_C void __assert_rtn(const char *function, const char *file, int line, + const char *assertion) /* __nothrow */ +#ifdef __dead2 + __dead2 +#else + MDBX_NORETURN +#endif /* __dead2 */ +#ifdef __disable_tail_calls + __disable_tail_calls +#endif /* __disable_tail_calls */ + ; + +#define __assert_fail(assertion, file, line, function) \ + __assert_rtn(function, file, line, assertion) +#elif defined(__sun) || defined(__SVR4) || defined(__svr4__) +__extern_C void __assert_c99(const char *assection, const char *file, int line, + const char *function) MDBX_NORETURN; +#define __assert_fail(assertion, file, line, function) \ + __assert_c99(assertion, file, line, function) +#elif defined(__OpenBSD__) +__extern_C __dead void __assert2(const char *file, int line, + const char *function, + const char *assertion) /* __nothrow */; +#define __assert_fail(assertion, file, line, function) \ + __assert2(file, line, function, assertion) +#elif defined(__NetBSD__) +__extern_C __dead void __assert13(const char *file, int line, + const char *function, + const char *assertion) /* __nothrow */; +#define __assert_fail(assertion, file, line, function) \ + __assert13(file, line, function, assertion) +#elif defined(__FreeBSD__) || defined(__BSD__) || defined(__bsdi__) || \ + defined(__DragonFly__) +__extern_C void __assert(const char *function, const char *file, int line, + const char *assertion) /* __nothrow */ +#ifdef __dead2 + __dead2 +#else + MDBX_NORETURN +#endif /* __dead2 */ +#ifdef __disable_tail_calls + __disable_tail_calls +#endif /* __disable_tail_calls */ + ; +#define __assert_fail(assertion, file, line, function) \ + __assert(function, file, line, assertion) + +#endif /* __assert_fail */ + +#if !defined(__ANDROID_API__) || MDBX_DEBUG + +void __cold mdbx_assert_fail(const MDBX_env *env, const char *msg, + const char *func, int line) { +#if MDBX_DEBUG + if (env && env->me_assert_func) { + env->me_assert_func(env, msg, func, line); + return; + } +#else + (void)env; +#endif /* MDBX_DEBUG */ + + if (mdbx_debug_logger) + mdbx_debug_log(MDBX_LOG_FATAL, func, line, "assert: %s\n", msg); + else { +#if defined(_WIN32) || defined(_WIN64) + char *message = nullptr; + const int num = mdbx_asprintf(&message, "\r\nMDBX-ASSERTION: %s, %s:%u", + msg, func ? func : "unknown", line); + if (num < 1 || !message) + message = ""; + OutputDebugStringA(message); + if (IsDebuggerPresent()) + DebugBreak(); +#elif defined(__ANDROID_API__) + __android_log_assert(msg, "mdbx", "%s:%u", func, line); +#else + __assert_fail(msg, "mdbx", line, func); +#endif + } + +#if defined(_WIN32) || defined(_WIN64) + FatalExit(ERROR_UNHANDLED_ERROR); +#else + abort(); +#endif +} + +#endif /* __ANDROID_API__ || MDBX_DEBUG */ + +__cold void mdbx_panic(const char *fmt, ...) { + va_list ap; + va_start(ap, fmt); + + char *message = nullptr; + const int num = mdbx_vasprintf(&message, fmt, ap); + va_end(ap); + const char *const const_message = + (num < 1 || !message) ? 
"" + : message; + +#if defined(_WIN32) || defined(_WIN64) + OutputDebugStringA("\r\nMDBX-PANIC: "); + OutputDebugStringA(const_message); + if (IsDebuggerPresent()) + DebugBreak(); + FatalExit(ERROR_UNHANDLED_ERROR); +#else +#if defined(__ANDROID_API__) + __android_log_assert("panic", "mdbx", "%s", const_message); +#else + __assert_fail(const_message, "mdbx", 0, "panic"); +#endif /* __ANDROID_API__ */ + abort(); +#endif +} + +/*----------------------------------------------------------------------------*/ + +#ifndef mdbx_vasprintf +MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, + va_list ap) { + va_list ones; + va_copy(ones, ap); + int needed = vsnprintf(nullptr, 0, fmt, ap); + + if (unlikely(needed < 0 || needed >= INT_MAX)) { + *strp = nullptr; + va_end(ones); + return needed; + } + + *strp = mdbx_malloc(needed + 1); + if (unlikely(*strp == nullptr)) { + va_end(ones); +#if defined(_WIN32) || defined(_WIN64) + SetLastError(MDBX_ENOMEM); +#else + errno = MDBX_ENOMEM; +#endif + return -1; + } + + int actual = vsnprintf(*strp, needed + 1, fmt, ones); + va_end(ones); + + assert(actual == needed); + if (unlikely(actual < 0)) { + mdbx_free(*strp); + *strp = nullptr; + } + return actual; +} +#endif /* mdbx_vasprintf */ + +#ifndef mdbx_asprintf +MDBX_INTERNAL_FUNC int mdbx_asprintf(char **strp, const char *fmt, ...) { + va_list ap; + va_start(ap, fmt); + int rc = mdbx_vasprintf(strp, fmt, ap); + va_end(ap); + return rc; +} +#endif /* mdbx_asprintf */ + +#ifndef mdbx_memalign_alloc +MDBX_INTERNAL_FUNC int mdbx_memalign_alloc(size_t alignment, size_t bytes, + void **result) { + assert(is_powerof2(alignment) && alignment >= sizeof(void *)); +#if defined(_WIN32) || defined(_WIN64) + (void)alignment; + *result = VirtualAlloc(NULL, bytes, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE); + return *result ? MDBX_SUCCESS : MDBX_ENOMEM /* ERROR_OUTOFMEMORY */; +#elif defined(_ISOC11_SOURCE) + *result = aligned_alloc(alignment, ceil_powerof2(bytes, alignment)); + return *result ? MDBX_SUCCESS : errno; +#elif _POSIX_VERSION >= 200112L && \ + (!defined(__ANDROID_API__) || __ANDROID_API__ >= 17) + *result = nullptr; + return posix_memalign(result, alignment, bytes); +#elif __GLIBC_PREREQ(2, 16) || __STDC_VERSION__ >= 201112L + *result = memalign(alignment, bytes); + return *result ? 
MDBX_SUCCESS : errno; +#else +#error FIXME +#endif +} +#endif /* mdbx_memalign_alloc */ + +#ifndef mdbx_memalign_free +MDBX_INTERNAL_FUNC void mdbx_memalign_free(void *ptr) { +#if defined(_WIN32) || defined(_WIN64) + VirtualFree(ptr, 0, MEM_RELEASE); +#else + mdbx_free(ptr); +#endif +} +#endif /* mdbx_memalign_free */ + +#ifndef mdbx_strdup +char *mdbx_strdup(const char *str) { + if (!str) + return NULL; + size_t bytes = strlen(str) + 1; + char *dup = mdbx_malloc(bytes); + if (dup) + memcpy(dup, str, bytes); + return dup; +} +#endif /* mdbx_strdup */ + +/*----------------------------------------------------------------------------*/ + +MDBX_INTERNAL_FUNC int mdbx_condpair_init(mdbx_condpair_t *condpair) { + int rc; + memset(condpair, 0, sizeof(mdbx_condpair_t)); +#if defined(_WIN32) || defined(_WIN64) + if ((condpair->mutex = CreateMutexW(NULL, FALSE, NULL)) == NULL) { + rc = (int)GetLastError(); + goto bailout_mutex; + } + if ((condpair->event[0] = CreateEventW(NULL, FALSE, FALSE, NULL)) == NULL) { + rc = (int)GetLastError(); + goto bailout_event; + } + if ((condpair->event[1] = CreateEventW(NULL, FALSE, FALSE, NULL)) != NULL) + return MDBX_SUCCESS; + + rc = (int)GetLastError(); + (void)CloseHandle(condpair->event[0]); +bailout_event: + (void)CloseHandle(condpair->mutex); +#else + rc = pthread_mutex_init(&condpair->mutex, NULL); + if (unlikely(rc != 0)) + goto bailout_mutex; + rc = pthread_cond_init(&condpair->cond[0], NULL); + if (unlikely(rc != 0)) + goto bailout_cond; + rc = pthread_cond_init(&condpair->cond[1], NULL); + if (likely(rc == 0)) + return MDBX_SUCCESS; + + (void)pthread_cond_destroy(&condpair->cond[0]); +bailout_cond: + (void)pthread_mutex_destroy(&condpair->mutex); +#endif +bailout_mutex: + memset(condpair, 0, sizeof(mdbx_condpair_t)); + return rc; +} + +MDBX_INTERNAL_FUNC int mdbx_condpair_destroy(mdbx_condpair_t *condpair) { +#if defined(_WIN32) || defined(_WIN64) + int rc = CloseHandle(condpair->mutex) ? MDBX_SUCCESS : (int)GetLastError(); + rc = CloseHandle(condpair->event[0]) ? rc : (int)GetLastError(); + rc = CloseHandle(condpair->event[1]) ? rc : (int)GetLastError(); +#else + int err, rc = pthread_mutex_destroy(&condpair->mutex); + rc = (err = pthread_cond_destroy(&condpair->cond[0])) ? err : rc; + rc = (err = pthread_cond_destroy(&condpair->cond[1])) ? err : rc; +#endif + memset(condpair, 0, sizeof(mdbx_condpair_t)); + return rc; +} + +MDBX_INTERNAL_FUNC int mdbx_condpair_lock(mdbx_condpair_t *condpair) { +#if defined(_WIN32) || defined(_WIN64) + DWORD code = WaitForSingleObject(condpair->mutex, INFINITE); + return waitstatus2errcode(code); +#else + return pthread_mutex_lock(&condpair->mutex); +#endif +} + +MDBX_INTERNAL_FUNC int mdbx_condpair_unlock(mdbx_condpair_t *condpair) { +#if defined(_WIN32) || defined(_WIN64) + return ReleaseMutex(condpair->mutex) ? MDBX_SUCCESS : (int)GetLastError(); +#else + return pthread_mutex_unlock(&condpair->mutex); +#endif +} + +MDBX_INTERNAL_FUNC int mdbx_condpair_signal(mdbx_condpair_t *condpair, + bool part) { +#if defined(_WIN32) || defined(_WIN64) + return SetEvent(condpair->event[part]) ? 
MDBX_SUCCESS : (int)GetLastError(); +#else + return pthread_cond_signal(&condpair->cond[part]); +#endif +} + +MDBX_INTERNAL_FUNC int mdbx_condpair_wait(mdbx_condpair_t *condpair, + bool part) { +#if defined(_WIN32) || defined(_WIN64) + DWORD code = SignalObjectAndWait(condpair->mutex, condpair->event[part], + INFINITE, FALSE); + if (code == WAIT_OBJECT_0) { + code = WaitForSingleObject(condpair->mutex, INFINITE); + if (code == WAIT_OBJECT_0) + return MDBX_SUCCESS; + } + return waitstatus2errcode(code); +#else + return pthread_cond_wait(&condpair->cond[part], &condpair->mutex); +#endif +} + +/*----------------------------------------------------------------------------*/ + +MDBX_INTERNAL_FUNC int mdbx_fastmutex_init(mdbx_fastmutex_t *fastmutex) { +#if defined(_WIN32) || defined(_WIN64) + InitializeCriticalSection(fastmutex); + return MDBX_SUCCESS; +#else + return pthread_mutex_init(fastmutex, NULL); +#endif +} + +MDBX_INTERNAL_FUNC int mdbx_fastmutex_destroy(mdbx_fastmutex_t *fastmutex) { +#if defined(_WIN32) || defined(_WIN64) + DeleteCriticalSection(fastmutex); + return MDBX_SUCCESS; +#else + return pthread_mutex_destroy(fastmutex); +#endif +} + +MDBX_INTERNAL_FUNC int mdbx_fastmutex_acquire(mdbx_fastmutex_t *fastmutex) { +#if defined(_WIN32) || defined(_WIN64) + __try { + EnterCriticalSection(fastmutex); + } __except ( + (GetExceptionCode() == + 0xC0000194 /* STATUS_POSSIBLE_DEADLOCK / EXCEPTION_POSSIBLE_DEADLOCK */) + ? EXCEPTION_EXECUTE_HANDLER + : EXCEPTION_CONTINUE_SEARCH) { + return ERROR_POSSIBLE_DEADLOCK; + } + return MDBX_SUCCESS; +#else + return pthread_mutex_lock(fastmutex); +#endif +} + +MDBX_INTERNAL_FUNC int mdbx_fastmutex_release(mdbx_fastmutex_t *fastmutex) { +#if defined(_WIN32) || defined(_WIN64) + LeaveCriticalSection(fastmutex); + return MDBX_SUCCESS; +#else + return pthread_mutex_unlock(fastmutex); +#endif +} + +/*----------------------------------------------------------------------------*/ + +MDBX_INTERNAL_FUNC int mdbx_removefile(const char *pathname) { +#if defined(_WIN32) || defined(_WIN64) + const size_t wlen = mbstowcs(nullptr, pathname, INT_MAX); + if (wlen < 1 || wlen > /* MAX_PATH */ INT16_MAX) + return ERROR_INVALID_NAME; + wchar_t *const pathnameW = _alloca((wlen + 1) * sizeof(wchar_t)); + if (wlen != mbstowcs(pathnameW, pathname, wlen + 1)) + return ERROR_INVALID_NAME; + return DeleteFileW(pathnameW) ? MDBX_SUCCESS : (int)GetLastError(); +#else + return unlink(pathname) ? errno : MDBX_SUCCESS; +#endif +} + +#if !(defined(_WIN32) || defined(_WIN64)) +static bool is_valid_fd(int fd) { return !(isatty(fd) < 0 && errno == EBADF); } +#endif /*! Windows */ + +MDBX_INTERNAL_FUNC int mdbx_removedirectory(const char *pathname) { +#if defined(_WIN32) || defined(_WIN64) + const size_t wlen = mbstowcs(nullptr, pathname, INT_MAX); + if (wlen < 1 || wlen > /* MAX_PATH */ INT16_MAX) + return ERROR_INVALID_NAME; + wchar_t *const pathnameW = _alloca((wlen + 1) * sizeof(wchar_t)); + if (wlen != mbstowcs(pathnameW, pathname, wlen + 1)) + return ERROR_INVALID_NAME; + return RemoveDirectoryW(pathnameW) ? MDBX_SUCCESS : (int)GetLastError(); +#else + return rmdir(pathname) ? 
errno : MDBX_SUCCESS; +#endif +} + +MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, + const MDBX_env *env, const char *pathname, + mdbx_filehandle_t *fd, + mdbx_mode_t unix_mode_bits) { + *fd = INVALID_HANDLE_VALUE; + +#if defined(_WIN32) || defined(_WIN64) + const size_t wlen = mbstowcs(nullptr, pathname, INT_MAX); + if (wlen < 1 || wlen > /* MAX_PATH */ INT16_MAX) + return ERROR_INVALID_NAME; + wchar_t *const pathnameW = _alloca((wlen + 1) * sizeof(wchar_t)); + if (wlen != mbstowcs(pathnameW, pathname, wlen + 1)) + return ERROR_INVALID_NAME; + + DWORD CreationDisposition = unix_mode_bits ? OPEN_ALWAYS : OPEN_EXISTING; + DWORD FlagsAndAttributes = + FILE_FLAG_POSIX_SEMANTICS | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED; + DWORD DesiredAccess = FILE_READ_ATTRIBUTES; + DWORD ShareMode = (env->me_flags & MDBX_EXCLUSIVE) + ? 0 + : (FILE_SHARE_READ | FILE_SHARE_WRITE); + + switch (purpose) { + default: + return ERROR_INVALID_PARAMETER; + case MDBX_OPEN_LCK: + CreationDisposition = OPEN_ALWAYS; + DesiredAccess |= GENERIC_READ | GENERIC_WRITE; + FlagsAndAttributes |= FILE_ATTRIBUTE_HIDDEN | FILE_ATTRIBUTE_TEMPORARY; + break; + case MDBX_OPEN_DXB_READ: + CreationDisposition = OPEN_EXISTING; + DesiredAccess |= GENERIC_READ; + ShareMode |= FILE_SHARE_READ; + break; + case MDBX_OPEN_DXB_LAZY: + DesiredAccess |= GENERIC_READ | GENERIC_WRITE; + break; + case MDBX_OPEN_DXB_DSYNC: + CreationDisposition = OPEN_EXISTING; + DesiredAccess |= GENERIC_WRITE; + FlagsAndAttributes |= FILE_FLAG_WRITE_THROUGH; + break; + case MDBX_OPEN_COPY: + CreationDisposition = CREATE_NEW; + ShareMode = 0; + DesiredAccess |= GENERIC_WRITE; + FlagsAndAttributes |= + (env->me_psize < env->me_os_psize) ? 0 : FILE_FLAG_NO_BUFFERING; + break; + case MDBX_OPEN_DELETE: + CreationDisposition = OPEN_EXISTING; + ShareMode |= FILE_SHARE_DELETE; + DesiredAccess = + FILE_READ_ATTRIBUTES | FILE_WRITE_ATTRIBUTES | DELETE | SYNCHRONIZE; + break; + } + + *fd = CreateFileW(pathnameW, DesiredAccess, ShareMode, NULL, + CreationDisposition, FlagsAndAttributes, NULL); + if (*fd == INVALID_HANDLE_VALUE) + return (int)GetLastError(); + + BY_HANDLE_FILE_INFORMATION info; + if (!GetFileInformationByHandle(*fd, &info)) { + int err = (int)GetLastError(); + CloseHandle(*fd); + *fd = INVALID_HANDLE_VALUE; + return err; + } + const DWORD AttributesDiff = + (info.dwFileAttributes ^ FlagsAndAttributes) & + (FILE_ATTRIBUTE_HIDDEN | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED | + FILE_ATTRIBUTE_TEMPORARY | FILE_ATTRIBUTE_COMPRESSED); + if (AttributesDiff) + (void)SetFileAttributesW(pathnameW, info.dwFileAttributes ^ AttributesDiff); + +#else + int flags = unix_mode_bits ? 
O_CREAT : 0; + switch (purpose) { + default: + return EINVAL; + case MDBX_OPEN_LCK: + flags |= O_RDWR; + break; + case MDBX_OPEN_DXB_READ: + flags = O_RDONLY; + break; + case MDBX_OPEN_DXB_LAZY: + flags |= O_RDWR; + break; + case MDBX_OPEN_COPY: + flags = O_CREAT | O_WRONLY | O_EXCL; + break; + case MDBX_OPEN_DXB_DSYNC: + flags |= O_WRONLY; +#if defined(O_DSYNC) + flags |= O_DSYNC; +#elif defined(O_SYNC) + flags |= O_SYNC; +#elif defined(O_FSYNC) + flags |= O_FSYNC; +#endif + break; + case MDBX_OPEN_DELETE: + flags = O_RDWR; + break; + } + + const bool direct_nocache_for_copy = + env->me_psize >= env->me_os_psize && purpose == MDBX_OPEN_COPY; + if (direct_nocache_for_copy) { +#if defined(O_DIRECT) + flags |= O_DIRECT; +#endif /* O_DIRECT */ +#if defined(O_NOCACHE) + flags |= O_NOCACHE; +#endif /* O_NOCACHE */ + } + +#ifdef O_CLOEXEC + flags |= O_CLOEXEC; +#endif /* O_CLOEXEC */ + + /* Safeguard for https://github.com/erthink/libmdbx/issues/144 */ +#if STDIN_FILENO == 0 && STDOUT_FILENO == 1 && STDERR_FILENO == 2 + int stub_fd0 = -1, stub_fd1 = -1, stub_fd2 = -1; + static const char dev_null[] = "/dev/null"; + if (!is_valid_fd(STDIN_FILENO)) { + mdbx_warning("STD%s_FILENO/%d is invalid, open %s for temporary stub", "IN", + STDIN_FILENO, dev_null); + stub_fd0 = open(dev_null, O_RDONLY | O_NOCTTY); + } + if (!is_valid_fd(STDOUT_FILENO)) { + mdbx_warning("STD%s_FILENO/%d is invalid, open %s for temporary stub", + "OUT", STDOUT_FILENO, dev_null); + stub_fd1 = open(dev_null, O_WRONLY | O_NOCTTY); + } + if (!is_valid_fd(STDERR_FILENO)) { + mdbx_warning("STD%s_FILENO/%d is invalid, open %s for temporary stub", + "ERR", STDERR_FILENO, dev_null); + stub_fd2 = open(dev_null, O_WRONLY | O_NOCTTY); + } +#else +#error "Unexpected or unsupported UNIX or POSIX system" +#endif /* STDIN_FILENO == 0 && STDERR_FILENO == 2 */ + + *fd = open(pathname, flags, unix_mode_bits); +#if defined(O_DIRECT) + if (*fd < 0 && (flags & O_DIRECT) && + (errno == EINVAL || errno == EAFNOSUPPORT)) { + flags &= ~(O_DIRECT | O_EXCL); + *fd = open(pathname, flags, unix_mode_bits); + } +#endif /* O_DIRECT */ + + /* Safeguard for https://github.com/erthink/libmdbx/issues/144 */ +#if STDIN_FILENO == 0 && STDOUT_FILENO == 1 && STDERR_FILENO == 2 + if (*fd == STDIN_FILENO) { + mdbx_warning("Got STD%s_FILENO/%d, avoid using it by dup(fd)", "IN", + STDIN_FILENO); + assert(stub_fd0 == -1); + *fd = dup(stub_fd0 = *fd); + } + if (*fd == STDOUT_FILENO) { + mdbx_warning("Got STD%s_FILENO/%d, avoid using it by dup(fd)", "OUT", + STDOUT_FILENO); + assert(stub_fd1 == -1); + *fd = dup(stub_fd1 = *fd); + } + if (*fd == STDERR_FILENO) { + mdbx_warning("Got STD%s_FILENO/%d, avoid using it by dup(fd)", "ERR", + STDERR_FILENO); + assert(stub_fd2 == -1); + *fd = dup(stub_fd2 = *fd); + } + if (stub_fd0 != -1) + close(stub_fd0); + if (stub_fd1 != -1) + close(stub_fd1); + if (stub_fd2 != -1) + close(stub_fd2); + if (*fd >= STDIN_FILENO && *fd <= STDERR_FILENO) { + mdbx_error( + "Rejecting the use of a FD in the range " + "STDIN_FILENO/%d..STDERR_FILENO/%d to prevent database corruption", + STDIN_FILENO, STDERR_FILENO); + close(*fd); + return EBADF; + } +#else +#error "Unexpected or unsupported UNIX or POSIX system" +#endif /* STDIN_FILENO == 0 && STDERR_FILENO == 2 */ + + if (*fd < 0) + return errno; + +#if defined(FD_CLOEXEC) && !defined(O_CLOEXEC) + const int fd_flags = fcntl(*fd, F_GETFD); + if (fd_flags != -1) + (void)fcntl(*fd, F_SETFD, fd_flags | FD_CLOEXEC); +#endif /* FD_CLOEXEC && !O_CLOEXEC */ + + if (direct_nocache_for_copy) { +#if 
defined(F_NOCACHE) && !defined(O_NOCACHE) + (void)fcntl(*fd, F_NOCACHE, 1); +#endif /* F_NOCACHE */ + } + +#endif + return MDBX_SUCCESS; +} + +MDBX_INTERNAL_FUNC int mdbx_closefile(mdbx_filehandle_t fd) { +#if defined(_WIN32) || defined(_WIN64) + return CloseHandle(fd) ? MDBX_SUCCESS : (int)GetLastError(); +#else + assert(fd > STDERR_FILENO); + return (close(fd) == 0) ? MDBX_SUCCESS : errno; +#endif +} + +MDBX_INTERNAL_FUNC int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t bytes, + uint64_t offset) { + if (bytes > MAX_WRITE) + return MDBX_EINVAL; +#if defined(_WIN32) || defined(_WIN64) + OVERLAPPED ov; + ov.hEvent = 0; + ov.Offset = (DWORD)offset; + ov.OffsetHigh = HIGH_DWORD(offset); + + DWORD read = 0; + if (unlikely(!ReadFile(fd, buf, (DWORD)bytes, &read, &ov))) { + int rc = (int)GetLastError(); + return (rc == MDBX_SUCCESS) ? /* paranoia */ ERROR_READ_FAULT : rc; + } +#else + STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t), + "libmdbx requires 64-bit file I/O on 64-bit systems"); + intptr_t read = pread(fd, buf, bytes, offset); + if (read < 0) { + int rc = errno; + return (rc == MDBX_SUCCESS) ? /* paranoia */ MDBX_EIO : rc; + } +#endif + return (bytes == (size_t)read) ? MDBX_SUCCESS : MDBX_ENODATA; +} + +MDBX_INTERNAL_FUNC int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf, + size_t bytes, uint64_t offset) { + while (true) { +#if defined(_WIN32) || defined(_WIN64) + OVERLAPPED ov; + ov.hEvent = 0; + ov.Offset = (DWORD)offset; + ov.OffsetHigh = HIGH_DWORD(offset); + + DWORD written; + if (unlikely(!WriteFile( + fd, buf, likely(bytes <= MAX_WRITE) ? (DWORD)bytes : MAX_WRITE, + &written, &ov))) + return (int)GetLastError(); + if (likely(bytes == written)) + return MDBX_SUCCESS; +#else + STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t), + "libmdbx requires 64-bit file I/O on 64-bit systems"); + const intptr_t written = + pwrite(fd, buf, likely(bytes <= MAX_WRITE) ? bytes : MAX_WRITE, offset); + if (likely(bytes == (size_t)written)) + return MDBX_SUCCESS; + if (written < 0) { + const int rc = errno; + if (rc != EINTR) + return rc; + continue; + } +#endif + bytes -= written; + offset += written; + buf = (char *)buf + written; + } +} + +MDBX_INTERNAL_FUNC int mdbx_write(mdbx_filehandle_t fd, const void *buf, + size_t bytes) { + while (true) { +#if defined(_WIN32) || defined(_WIN64) + DWORD written; + if (unlikely(!WriteFile( + fd, buf, likely(bytes <= MAX_WRITE) ? (DWORD)bytes : MAX_WRITE, + &written, nullptr))) + return (int)GetLastError(); + if (likely(bytes == written)) + return MDBX_SUCCESS; +#else + STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t), + "libmdbx requires 64-bit file I/O on 64-bit systems"); + const intptr_t written = + write(fd, buf, likely(bytes <= MAX_WRITE) ? bytes : MAX_WRITE); + if (likely(bytes == (size_t)written)) + return MDBX_SUCCESS; + if (written < 0) { + const int rc = errno; + if (rc != EINTR) + return rc; + continue; + } +#endif + bytes -= written; + buf = (char *)buf + written; + } +} + +int mdbx_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int iovcnt, + uint64_t offset, size_t expected_written) { +#if defined(_WIN32) || defined(_WIN64) || defined(__APPLE__) || \ + (defined(__ANDROID_API__) && __ANDROID_API__ < 24) + size_t written = 0; + for (int i = 0; i < iovcnt; ++i) { + int rc = mdbx_pwrite(fd, iov[i].iov_base, iov[i].iov_len, offset); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + written += iov[i].iov_len; + offset += iov[i].iov_len; + } + return (expected_written == written) ? 
MDBX_SUCCESS + : MDBX_EIO /* ERROR_WRITE_FAULT */; +#else + int rc; + intptr_t written; + do { + STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t), + "libmdbx requires 64-bit file I/O on 64-bit systems"); + written = pwritev(fd, iov, iovcnt, offset); + if (likely(expected_written == (size_t)written)) + return MDBX_SUCCESS; + rc = errno; + } while (rc == EINTR); + return (written < 0) ? rc : MDBX_EIO /* Use which error code? */; +#endif +} + +MDBX_INTERNAL_FUNC int mdbx_fsync(mdbx_filehandle_t fd, + enum mdbx_syncmode_bits mode_bits) { +#if defined(_WIN32) || defined(_WIN64) + if ((mode_bits & (MDBX_SYNC_DATA | MDBX_SYNC_IODQ)) && !FlushFileBuffers(fd)) + return (int)GetLastError(); + return MDBX_SUCCESS; +#else + +#if defined(__APPLE__) && \ + MDBX_OSX_SPEED_INSTEADOF_DURABILITY == MDBX_OSX_WANNA_DURABILITY + if (mode_bits & MDBX_SYNC_IODQ) + return likely(fcntl(fd, F_FULLFSYNC) != -1) ? MDBX_SUCCESS : errno; +#endif /* MacOS */ + + /* LY: This approach is always safe and without appreciable performance + * degradation, even on a kernel with fdatasync's bug. + * + * For more info about of a corresponding fdatasync() bug + * see http://www.spinics.net/lists/linux-ext4/msg33714.html */ + while (1) { + switch (mode_bits & (MDBX_SYNC_DATA | MDBX_SYNC_SIZE)) { + case MDBX_SYNC_NONE: + return MDBX_SUCCESS /* nothing to do */; +#if defined(_POSIX_SYNCHRONIZED_IO) && _POSIX_SYNCHRONIZED_IO > 0 + case MDBX_SYNC_DATA: + if (fdatasync(fd) == 0) + return MDBX_SUCCESS; + break /* error */; +#if defined(__linux__) || defined(__gnu_linux__) + case MDBX_SYNC_SIZE: + if (mdbx_linux_kernel_version >= 0x03060000) + return MDBX_SUCCESS; + __fallthrough /* fall through */; +#endif /* Linux */ +#endif /* _POSIX_SYNCHRONIZED_IO > 0 */ + default: + if (fsync(fd) == 0) + return MDBX_SUCCESS; + } + + int rc = errno; + if (rc != EINTR) + return rc; + } +#endif +} + +int mdbx_filesize(mdbx_filehandle_t fd, uint64_t *length) { +#if defined(_WIN32) || defined(_WIN64) + BY_HANDLE_FILE_INFORMATION info; + if (!GetFileInformationByHandle(fd, &info)) + return (int)GetLastError(); + *length = info.nFileSizeLow | (uint64_t)info.nFileSizeHigh << 32; +#else + struct stat st; + + STATIC_ASSERT_MSG(sizeof(off_t) <= sizeof(uint64_t), + "libmdbx requires 64-bit file I/O on 64-bit systems"); + if (fstat(fd, &st)) + return errno; + + *length = st.st_size; +#endif + return MDBX_SUCCESS; +} + +MDBX_INTERNAL_FUNC int mdbx_is_pipe(mdbx_filehandle_t fd) { +#if defined(_WIN32) || defined(_WIN64) + switch (GetFileType(fd)) { + case FILE_TYPE_DISK: + return MDBX_RESULT_FALSE; + case FILE_TYPE_CHAR: + case FILE_TYPE_PIPE: + return MDBX_RESULT_TRUE; + default: + return (int)GetLastError(); + } +#else + struct stat info; + if (fstat(fd, &info)) + return errno; + switch (info.st_mode & S_IFMT) { + case S_IFBLK: + case S_IFREG: + return MDBX_RESULT_FALSE; + case S_IFCHR: + case S_IFIFO: + case S_IFSOCK: + return MDBX_RESULT_TRUE; + case S_IFDIR: + case S_IFLNK: + default: + return MDBX_INCOMPATIBLE; + } +#endif +} + +MDBX_INTERNAL_FUNC int mdbx_ftruncate(mdbx_filehandle_t fd, uint64_t length) { +#if defined(_WIN32) || defined(_WIN64) + if (mdbx_SetFileInformationByHandle) { + FILE_END_OF_FILE_INFO EndOfFileInfo; + EndOfFileInfo.EndOfFile.QuadPart = length; + return mdbx_SetFileInformationByHandle(fd, FileEndOfFileInfo, + &EndOfFileInfo, + sizeof(FILE_END_OF_FILE_INFO)) + ? MDBX_SUCCESS + : (int)GetLastError(); + } else { + LARGE_INTEGER li; + li.QuadPart = length; + return (SetFilePointerEx(fd, li, NULL, FILE_BEGIN) && SetEndOfFile(fd)) + ? 
MDBX_SUCCESS + : (int)GetLastError(); + } +#else + STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t), + "libmdbx requires 64-bit file I/O on 64-bit systems"); + return ftruncate(fd, length) == 0 ? MDBX_SUCCESS : errno; +#endif +} + +MDBX_INTERNAL_FUNC int mdbx_fseek(mdbx_filehandle_t fd, uint64_t pos) { +#if defined(_WIN32) || defined(_WIN64) + LARGE_INTEGER li; + li.QuadPart = pos; + return SetFilePointerEx(fd, li, NULL, FILE_BEGIN) ? MDBX_SUCCESS + : (int)GetLastError(); +#else + STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t), + "libmdbx requires 64-bit file I/O on 64-bit systems"); + return (lseek(fd, pos, SEEK_SET) < 0) ? errno : MDBX_SUCCESS; +#endif +} + +/*----------------------------------------------------------------------------*/ + +MDBX_INTERNAL_FUNC int +mdbx_thread_create(mdbx_thread_t *thread, + THREAD_RESULT(THREAD_CALL *start_routine)(void *), + void *arg) { +#if defined(_WIN32) || defined(_WIN64) + *thread = CreateThread(NULL, 0, start_routine, arg, 0, NULL); + return *thread ? MDBX_SUCCESS : (int)GetLastError(); +#else + return pthread_create(thread, NULL, start_routine, arg); +#endif +} + +MDBX_INTERNAL_FUNC int mdbx_thread_join(mdbx_thread_t thread) { +#if defined(_WIN32) || defined(_WIN64) + DWORD code = WaitForSingleObject(thread, INFINITE); + return waitstatus2errcode(code); +#else + void *unused_retval = &unused_retval; + return pthread_join(thread, &unused_retval); +#endif +} + +/*----------------------------------------------------------------------------*/ + +MDBX_INTERNAL_FUNC int mdbx_msync(mdbx_mmap_t *map, size_t offset, + size_t length, + enum mdbx_syncmode_bits mode_bits) { + uint8_t *ptr = (uint8_t *)map->address + offset; +#if defined(_WIN32) || defined(_WIN64) + if (!FlushViewOfFile(ptr, length)) + return (int)GetLastError(); +#else +#if defined(__linux__) || defined(__gnu_linux__) + if (mode_bits == MDBX_SYNC_NONE && mdbx_linux_kernel_version > 0x02061300) + /* Since Linux 2.6.19, MS_ASYNC is in fact a no-op. The kernel properly + * tracks dirty pages and flushes them to storage as necessary. */ + return MDBX_SUCCESS; +#endif /* Linux */ + if (msync(ptr, length, (mode_bits & MDBX_SYNC_DATA) ? MS_SYNC : MS_ASYNC)) + return errno; + mode_bits &= ~MDBX_SYNC_DATA; +#endif + return mdbx_fsync(map->fd, mode_bits); +} + +MDBX_INTERNAL_FUNC int mdbx_check_fs_rdonly(mdbx_filehandle_t handle, + const char *pathname, int err) { +#if defined(_WIN32) || defined(_WIN64) + (void)pathname; + (void)err; + if (!mdbx_GetVolumeInformationByHandleW) + return MDBX_ENOSYS; + DWORD unused, flags; + if (!mdbx_GetVolumeInformationByHandleW(handle, nullptr, 0, nullptr, &unused, + &flags, nullptr, 0)) + return (int)GetLastError(); + if ((flags & FILE_READ_ONLY_VOLUME) == 0) + return MDBX_EACCESS; +#else + struct statvfs info; + if (err != MDBX_ENOFILE) { + if (statvfs(pathname, &info)) + return errno; + if ((info.f_flag & ST_RDONLY) == 0) + return err; + } + if (fstatvfs(handle, &info)) + return errno; + if ((info.f_flag & ST_RDONLY) == 0) + return (err == MDBX_ENOFILE) ? 
MDBX_EACCESS : err; +#endif /* !Windows */ + return MDBX_SUCCESS; +} + +static int mdbx_check_fs_local(mdbx_filehandle_t handle, int flags) { +#if defined(_WIN32) || defined(_WIN64) + if (mdbx_RunningUnderWine() && !(flags & MDBX_EXCLUSIVE)) + return ERROR_NOT_CAPABLE /* workaround for Wine */; + + if (GetFileType(handle) != FILE_TYPE_DISK) + return ERROR_FILE_OFFLINE; + + if (mdbx_GetFileInformationByHandleEx) { + FILE_REMOTE_PROTOCOL_INFO RemoteProtocolInfo; + if (mdbx_GetFileInformationByHandleEx(handle, FileRemoteProtocolInfo, + &RemoteProtocolInfo, + sizeof(RemoteProtocolInfo))) { + if ((RemoteProtocolInfo.Flags & REMOTE_PROTOCOL_INFO_FLAG_OFFLINE) && + !(flags & MDBX_RDONLY)) + return ERROR_FILE_OFFLINE; + if (!(RemoteProtocolInfo.Flags & REMOTE_PROTOCOL_INFO_FLAG_LOOPBACK) && + !(flags & MDBX_EXCLUSIVE)) + return ERROR_REMOTE_STORAGE_MEDIA_ERROR; + } + } + + if (mdbx_NtFsControlFile) { + NTSTATUS rc; + struct { + WOF_EXTERNAL_INFO wof_info; + union { + WIM_PROVIDER_EXTERNAL_INFO wim_info; + FILE_PROVIDER_EXTERNAL_INFO_V1 file_info; + }; + size_t reserved_for_microsoft_madness[42]; + } GetExternalBacking_OutputBuffer; + IO_STATUS_BLOCK StatusBlock; + rc = mdbx_NtFsControlFile(handle, NULL, NULL, NULL, &StatusBlock, + FSCTL_GET_EXTERNAL_BACKING, NULL, 0, + &GetExternalBacking_OutputBuffer, + sizeof(GetExternalBacking_OutputBuffer)); + if (NT_SUCCESS(rc)) { + if (!(flags & MDBX_EXCLUSIVE)) + return ERROR_REMOTE_STORAGE_MEDIA_ERROR; + } else if (rc != STATUS_OBJECT_NOT_EXTERNALLY_BACKED && + rc != STATUS_INVALID_DEVICE_REQUEST && + rc != STATUS_NOT_SUPPORTED) + return ntstatus2errcode(rc); + } + + if (mdbx_GetVolumeInformationByHandleW && mdbx_GetFinalPathNameByHandleW) { + WCHAR *PathBuffer = mdbx_malloc(sizeof(WCHAR) * INT16_MAX); + if (!PathBuffer) + return MDBX_ENOMEM; + + int rc = MDBX_SUCCESS; + DWORD VolumeSerialNumber, FileSystemFlags; + if (!mdbx_GetVolumeInformationByHandleW(handle, PathBuffer, INT16_MAX, + &VolumeSerialNumber, NULL, + &FileSystemFlags, NULL, 0)) { + rc = (int)GetLastError(); + goto bailout; + } + + if ((flags & MDBX_RDONLY) == 0) { + if (FileSystemFlags & + (FILE_SEQUENTIAL_WRITE_ONCE | FILE_READ_ONLY_VOLUME | + FILE_VOLUME_IS_COMPRESSED)) { + rc = ERROR_REMOTE_STORAGE_MEDIA_ERROR; + goto bailout; + } + } + + if (!mdbx_GetFinalPathNameByHandleW(handle, PathBuffer, INT16_MAX, + FILE_NAME_NORMALIZED | + VOLUME_NAME_NT)) { + rc = (int)GetLastError(); + goto bailout; + } + + if (_wcsnicmp(PathBuffer, L"\\Device\\Mup\\", 12) == 0) { + if (!(flags & MDBX_EXCLUSIVE)) { + rc = ERROR_REMOTE_STORAGE_MEDIA_ERROR; + goto bailout; + } + } else if (mdbx_GetFinalPathNameByHandleW(handle, PathBuffer, INT16_MAX, + FILE_NAME_NORMALIZED | + VOLUME_NAME_DOS)) { + UINT DriveType = GetDriveTypeW(PathBuffer); + if (DriveType == DRIVE_NO_ROOT_DIR && + _wcsnicmp(PathBuffer, L"\\\\?\\", 4) == 0 && + _wcsnicmp(PathBuffer + 5, L":\\", 2) == 0) { + PathBuffer[7] = 0; + DriveType = GetDriveTypeW(PathBuffer + 4); + } + switch (DriveType) { + case DRIVE_CDROM: + if (flags & MDBX_RDONLY) + break; + // fall through + case DRIVE_UNKNOWN: + case DRIVE_NO_ROOT_DIR: + case DRIVE_REMOTE: + default: + if (!(flags & MDBX_EXCLUSIVE)) + rc = ERROR_REMOTE_STORAGE_MEDIA_ERROR; + // fall through + case DRIVE_REMOVABLE: + case DRIVE_FIXED: + case DRIVE_RAMDISK: + break; + } + } + bailout: + mdbx_free(PathBuffer); + return rc; + } + +#else + + struct statvfs statvfs_info; + if (fstatvfs(handle, &statvfs_info)) + return errno; +#if defined(ST_LOCAL) || defined(ST_EXPORTED) + const unsigned long st_flags = 
statvfs_info.f_flag; +#endif /* ST_LOCAL || ST_EXPORTED */ + +#if defined(__NetBSD__) + const unsigned type = 0; + const char *const name = statvfs_info.f_fstypename; + const size_t name_len = VFS_NAMELEN; +#elif defined(_AIX) || defined(__OS400__) + const char *const name = statvfs_info.f_basetype; + const size_t name_len = sizeof(statvfs_info.f_basetype); + struct stat st; + if (fstat(handle, &st)) + return errno; + const unsigned type = st.st_vfstype; + if ((st.st_flag & FS_REMOTE) != 0 && !(flags & MDBX_EXCLUSIVE)) + return MDBX_EREMOTE; +#elif defined(FSTYPSZ) || defined(_FSTYPSZ) + const unsigned type = 0; + const char *const name = statvfs_info.f_basetype; + const size_t name_len = sizeof(statvfs_info.f_basetype); +#elif defined(__sun) || defined(__SVR4) || defined(__svr4__) || \ + defined(ST_FSTYPSZ) || defined(_ST_FSTYPSZ) + const unsigned type = 0; + struct stat st; + if (fstat(handle, &st)) + return errno; + const char *const name = st.st_fstype; + const size_t name_len = strlen(name); +#else + struct statfs statfs_info; + if (fstatfs(handle, &statfs_info)) + return errno; +#if defined(__OpenBSD__) + const unsigned type = 0; +#else + const unsigned type = statfs_info.f_type; +#endif +#if defined(MNT_LOCAL) || defined(MNT_EXPORTED) + const unsigned long mnt_flags = statfs_info.f_flags; +#endif /* MNT_LOCAL || MNT_EXPORTED */ +#if defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ + defined(__BSD__) || defined(__bsdi__) || defined(__DragonFly__) || \ + defined(__APPLE__) || defined(__MACH__) || defined(MFSNAMELEN) || \ + defined(MFSTYPENAMELEN) || defined(VFS_NAMELEN) + const char *const name = statfs_info.f_fstypename; + const size_t name_len = sizeof(statfs_info.f_fstypename); +#elif defined(__ANDROID_API__) && __ANDROID_API__ < 21 + const char *const name = ""; + const unsigned name_len = 0; +#else + + const char *name = ""; + unsigned name_len = 0; + + struct stat st; + if (fstat(handle, &st)) + return errno; + + char pathbuf[PATH_MAX]; + FILE *mounted = nullptr; +#if defined(__linux__) || defined(__gnu_linux__) + mounted = setmntent("/proc/mounts", "r"); +#endif /* Linux */ + if (!mounted) + mounted = setmntent("/etc/mtab", "r"); + if (mounted) { + const struct mntent *ent; +#if defined(_BSD_SOURCE) || defined(_SVID_SOURCE) || defined(__BIONIC__) || \ + (defined(_DEFAULT_SOURCE) && __GLIBC_PREREQ(2, 19)) + struct mntent entbuf; + const bool should_copy = false; + while (nullptr != + (ent = getmntent_r(mounted, &entbuf, pathbuf, sizeof(pathbuf)))) +#else + const bool should_copy = true; + while (nullptr != (ent = getmntent(mounted))) +#endif + { + struct stat mnt; + if (!stat(ent->mnt_dir, &mnt) && mnt.st_dev == st.st_dev) { + if (should_copy) { + name = + strncpy(pathbuf, ent->mnt_fsname, name_len = sizeof(pathbuf) - 1); + pathbuf[name_len] = 0; + } else { + name = ent->mnt_fsname; + name_len = strlen(name); + } + break; + } + } + endmntent(mounted); + } +#endif /* !xBSD && !Android/Bionic */ +#endif + + if (name_len) { + if (((name_len > 2 && strncasecmp("nfs", name, 3) == 0) || + strncasecmp("cifs", name, name_len) == 0 || + strncasecmp("ncpfs", name, name_len) == 0 || + strncasecmp("smbfs", name, name_len) == 0 || + strcasecmp("9P" /* WSL2 */, name) == 0 || + ((name_len > 3 && strncasecmp("fuse", name, 4) == 0) && + strncasecmp("fuseblk", name, name_len) != 0)) && + !(flags & MDBX_EXCLUSIVE)) + return MDBX_EREMOTE; + if (strcasecmp("ftp", name) == 0 || strcasecmp("http", name) == 0 || + strcasecmp("sshfs", name) == 0) + return MDBX_EREMOTE; + } + 
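+  /* [editorial note, not part of the upstream source] The name checks above
+   * are a last-resort guard against placing the database on a network
+   * filesystem, where mmap coherence and file locking are not dependable.
+   * A minimal standalone sketch of the same mount-table walk, assuming Linux
+   * with glibc (fs_type_of_fd is a hypothetical helper name; note the sketch
+   * reports mnt_type, while the code above collects mnt_fsname):
+   *
+   *   #include <mntent.h>
+   *   #include <stdio.h>
+   *   #include <sys/stat.h>
+   *
+   *   // Copy the type (e.g. "ext4", "nfs4") of the filesystem holding
+   *   // `fd` into `out`; return 0 on success, -1 on failure.
+   *   static int fs_type_of_fd(int fd, char *out, size_t n) {
+   *     struct stat st;
+   *     if (fstat(fd, &st))
+   *       return -1;
+   *     FILE *mtab = setmntent("/proc/mounts", "r");
+   *     if (!mtab)
+   *       return -1;
+   *     int rc = -1;
+   *     for (struct mntent *ent; (ent = getmntent(mtab)) != NULL;) {
+   *       struct stat mnt;
+   *       if (stat(ent->mnt_dir, &mnt) == 0 && mnt.st_dev == st.st_dev) {
+   *         snprintf(out, n, "%s", ent->mnt_type);
+   *         rc = 0;
+   *         break;
+   *       }
+   *     }
+   *     endmntent(mtab);
+   *     return rc;
+   *   }
+   */
+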
+#ifdef ST_LOCAL + if ((st_flags & ST_LOCAL) == 0 && !(flags & MDBX_EXCLUSIVE)) + return MDBX_EREMOTE; +#elif defined(MNT_LOCAL) + if ((mnt_flags & MNT_LOCAL) == 0 && !(flags & MDBX_EXCLUSIVE)) + return MDBX_EREMOTE; +#endif /* ST/MNT_LOCAL */ + +#ifdef ST_EXPORTED + if ((st_flags & ST_EXPORTED) != 0 && !(flags & MDBX_RDONLY)) + return MDBX_EREMOTE; +#elif defined(MNT_EXPORTED) + if ((mnt_flags & MNT_EXPORTED) != 0 && !(flags & MDBX_RDONLY)) + return MDBX_EREMOTE; +#endif /* ST/MNT_EXPORTED */ + + switch (type) { + case 0xFF534D42 /* CIFS_MAGIC_NUMBER */: + case 0x6969 /* NFS_SUPER_MAGIC */: + case 0x564c /* NCP_SUPER_MAGIC */: + case 0x517B /* SMB_SUPER_MAGIC */: +#if defined(__digital__) || defined(__osf__) || defined(__osf) + case 0x0E /* Tru64 NFS */: +#endif +#ifdef ST_FST_NFS + case ST_FST_NFS: +#endif + if ((flags & MDBX_EXCLUSIVE) == 0) + return MDBX_EREMOTE; + case 0: + default: + break; + } +#endif /* Unix */ + + return MDBX_SUCCESS; +} + +MDBX_INTERNAL_FUNC int mdbx_mmap(const int flags, mdbx_mmap_t *map, + const size_t size, const size_t limit, + const unsigned options) { + assert(size <= limit); + map->limit = 0; + map->current = 0; + map->address = nullptr; +#if defined(_WIN32) || defined(_WIN64) + map->section = NULL; + map->filesize = 0; +#endif /* Windows */ + + int err = mdbx_check_fs_local(map->fd, flags); + if (unlikely(err != MDBX_SUCCESS)) + return err; + + if ((flags & MDBX_RDONLY) == 0 && (options & MMAP_OPTION_TRUNCATE) != 0) { + err = mdbx_ftruncate(map->fd, size); + if (err != MDBX_SUCCESS) + return err; +#if defined(_WIN32) || defined(_WIN64) + map->filesize = size; +#else + map->current = size; +#endif /* ! Windows */ + } else { + uint64_t filesize = 0; + err = mdbx_filesize(map->fd, &filesize); + if (err != MDBX_SUCCESS) + return err; +#if defined(_WIN32) || defined(_WIN64) + map->filesize = filesize; +#else + map->current = (filesize > limit) ? limit : (size_t)filesize; +#endif /* ! Windows */ + } + +#if defined(_WIN32) || defined(_WIN64) + LARGE_INTEGER SectionSize; + SectionSize.QuadPart = size; + err = NtCreateSection( + &map->section, + /* DesiredAccess */ + (flags & MDBX_WRITEMAP) + ? SECTION_QUERY | SECTION_MAP_READ | SECTION_EXTEND_SIZE | + SECTION_MAP_WRITE + : SECTION_QUERY | SECTION_MAP_READ | SECTION_EXTEND_SIZE, + /* ObjectAttributes */ NULL, /* MaximumSize (InitialSize) */ &SectionSize, + /* SectionPageProtection */ + (flags & MDBX_RDONLY) ? PAGE_READONLY : PAGE_READWRITE, + /* AllocationAttributes */ SEC_RESERVE, map->fd); + if (!NT_SUCCESS(err)) + return ntstatus2errcode(err); + + SIZE_T ViewSize = + (flags & MDBX_RDONLY) ? 0 : mdbx_RunningUnderWine() ? size : limit; + err = NtMapViewOfSection( + map->section, GetCurrentProcess(), &map->address, + /* ZeroBits */ 0, + /* CommitSize */ 0, + /* SectionOffset */ NULL, &ViewSize, + /* InheritDisposition */ ViewUnmap, + /* AllocationType */ (flags & MDBX_RDONLY) ? 0 : MEM_RESERVE, + /* Win32Protect */ + (flags & MDBX_WRITEMAP) ? 
PAGE_READWRITE : PAGE_READONLY); + if (!NT_SUCCESS(err)) { + NtClose(map->section); + map->section = 0; + map->address = nullptr; + return ntstatus2errcode(err); + } + assert(map->address != MAP_FAILED); + + map->current = (size_t)SectionSize.QuadPart; + map->limit = ViewSize; + +#else /* Windows */ + +#ifndef MAP_TRYFIXED +#define MAP_TRYFIXED 0 +#endif + +#ifndef MAP_HASSEMAPHORE +#define MAP_HASSEMAPHORE 0 +#endif + +#ifndef MAP_CONCEAL +#define MAP_CONCEAL 0 +#endif + +#ifndef MAP_NOSYNC +#define MAP_NOSYNC 0 +#endif + +#ifndef MAP_FIXED_NOREPLACE +#define MAP_FIXED_NOREPLACE 0 +#endif + +#ifndef MAP_NORESERVE +#define MAP_NORESERVE 0 +#endif + + map->address = mmap( + NULL, limit, (flags & MDBX_WRITEMAP) ? PROT_READ | PROT_WRITE : PROT_READ, + MAP_SHARED | MAP_FILE | MAP_NORESERVE | + (F_ISSET(flags, MDBX_UTTERLY_NOSYNC) ? MAP_NOSYNC : 0) | + ((options & MMAP_OPTION_SEMAPHORE) ? MAP_HASSEMAPHORE | MAP_NOSYNC + : MAP_CONCEAL), + map->fd, 0); + + if (unlikely(map->address == MAP_FAILED)) { + map->limit = 0; + map->current = 0; + map->address = nullptr; + return errno; + } + map->limit = limit; + +#if MDBX_ENABLE_MADVISE +#ifdef MADV_DONTFORK + if (unlikely(madvise(map->address, map->limit, MADV_DONTFORK) != 0)) + return errno; +#endif /* MADV_DONTFORK */ +#ifdef MADV_NOHUGEPAGE + (void)madvise(map->address, map->limit, MADV_NOHUGEPAGE); +#endif /* MADV_NOHUGEPAGE */ +#endif /* MDBX_ENABLE_MADVISE */ + +#endif /* ! Windows */ + + VALGRIND_MAKE_MEM_DEFINED(map->address, map->current); + ASAN_UNPOISON_MEMORY_REGION(map->address, map->current); + return MDBX_SUCCESS; +} + +MDBX_INTERNAL_FUNC int mdbx_munmap(mdbx_mmap_t *map) { + VALGRIND_MAKE_MEM_NOACCESS(map->address, map->current); + /* Unpoisoning is required for ASAN to avoid false-positive diagnostic + * when this memory will re-used by malloc or another mmapping. + * See https://github.com/erthink/libmdbx/pull/93#issuecomment-613687203 */ + ASAN_UNPOISON_MEMORY_REGION(map->address, map->limit); +#if defined(_WIN32) || defined(_WIN64) + if (map->section) + NtClose(map->section); + NTSTATUS rc = NtUnmapViewOfSection(GetCurrentProcess(), map->address); + if (!NT_SUCCESS(rc)) + ntstatus2errcode(rc); +#else + if (unlikely(munmap(map->address, map->limit))) + return errno; +#endif /* ! 
Windows */ + + map->limit = 0; + map->current = 0; + map->address = nullptr; + return MDBX_SUCCESS; +} + +MDBX_INTERNAL_FUNC int mdbx_mresize(const int flags, mdbx_mmap_t *map, + size_t size, size_t limit) { + assert(size <= limit); +#if defined(_WIN32) || defined(_WIN64) + assert(size != map->current || limit != map->limit || size < map->filesize); + + NTSTATUS status; + LARGE_INTEGER SectionSize; + int err, rc = MDBX_SUCCESS; + + if (!(flags & MDBX_RDONLY) && limit == map->limit && size > map->current && + /* workaround for Wine */ mdbx_NtExtendSection) { + /* growth rw-section */ + SectionSize.QuadPart = size; + status = mdbx_NtExtendSection(map->section, &SectionSize); + if (!NT_SUCCESS(status)) + return ntstatus2errcode(status); + map->current = size; + if (map->filesize < size) + map->filesize = size; + return MDBX_SUCCESS; + } + + if (limit > map->limit) { + /* check ability of address space for growth before unmap */ + PVOID BaseAddress = (PBYTE)map->address + map->limit; + SIZE_T RegionSize = limit - map->limit; + status = NtAllocateVirtualMemory(GetCurrentProcess(), &BaseAddress, 0, + &RegionSize, MEM_RESERVE, PAGE_NOACCESS); + if (status == (NTSTATUS) /* STATUS_CONFLICTING_ADDRESSES */ 0xC0000018) + return MDBX_UNABLE_EXTEND_MAPSIZE; + if (!NT_SUCCESS(status)) + return ntstatus2errcode(status); + + status = NtFreeVirtualMemory(GetCurrentProcess(), &BaseAddress, &RegionSize, + MEM_RELEASE); + if (!NT_SUCCESS(status)) + return ntstatus2errcode(status); + } + + /* Windows unable: + * - shrink a mapped file; + * - change size of mapped view; + * - extend read-only mapping; + * Therefore we should unmap/map entire section. */ + if ((flags & MDBX_MRESIZE_MAY_UNMAP) == 0) + return MDBX_RESULT_TRUE; + + status = NtUnmapViewOfSection(GetCurrentProcess(), map->address); + if (!NT_SUCCESS(status)) + return ntstatus2errcode(status); + status = NtClose(map->section); + map->section = NULL; + PVOID ReservedAddress = NULL; + SIZE_T ReservedSize = limit; + + if (!NT_SUCCESS(status)) { + bailout_ntstatus: + err = ntstatus2errcode(status); + bailout: + map->address = NULL; + map->current = map->limit = 0; + if (ReservedAddress) { + ReservedSize = 0; + status = NtFreeVirtualMemory(GetCurrentProcess(), &ReservedAddress, + &ReservedSize, MEM_RELEASE); + assert(NT_SUCCESS(status)); + (void)status; + } + return err; + } + +retry_file_and_section: + /* resizing of the file may take a while, + * therefore we reserve address space to avoid occupy it by other threads */ + ReservedAddress = map->address; + status = NtAllocateVirtualMemory(GetCurrentProcess(), &ReservedAddress, 0, + &ReservedSize, MEM_RESERVE, PAGE_NOACCESS); + if (!NT_SUCCESS(status)) { + ReservedAddress = NULL; + if (status != (NTSTATUS) /* STATUS_CONFLICTING_ADDRESSES */ 0xC0000018) + goto bailout_ntstatus /* no way to recovery */; + + if (flags & MDBX_MRESIZE_MAY_MOVE) + /* the base address could be changed */ + map->address = NULL; + } + + err = mdbx_filesize(map->fd, &map->filesize); + if (err != MDBX_SUCCESS) + goto bailout; + + if ((flags & MDBX_RDONLY) == 0 && map->filesize != size) { + err = mdbx_ftruncate(map->fd, size); + if (err == MDBX_SUCCESS) + map->filesize = size; + /* ignore error, because Windows unable shrink file + * that already mapped (by another process) */ + } + + SectionSize.QuadPart = size; + status = NtCreateSection( + &map->section, + /* DesiredAccess */ + (flags & MDBX_WRITEMAP) + ? 
SECTION_QUERY | SECTION_MAP_READ | SECTION_EXTEND_SIZE | + SECTION_MAP_WRITE + : SECTION_QUERY | SECTION_MAP_READ | SECTION_EXTEND_SIZE, + /* ObjectAttributes */ NULL, + /* MaximumSize (InitialSize) */ &SectionSize, + /* SectionPageProtection */ + (flags & MDBX_RDONLY) ? PAGE_READONLY : PAGE_READWRITE, + /* AllocationAttributes */ SEC_RESERVE, map->fd); + + if (!NT_SUCCESS(status)) + goto bailout_ntstatus; + + if (ReservedAddress) { + /* release reserved address space */ + ReservedSize = 0; + status = NtFreeVirtualMemory(GetCurrentProcess(), &ReservedAddress, + &ReservedSize, MEM_RELEASE); + ReservedAddress = NULL; + if (!NT_SUCCESS(status)) + goto bailout_ntstatus; + } + +retry_mapview:; + SIZE_T ViewSize = (flags & MDBX_RDONLY) ? size : limit; + status = NtMapViewOfSection( + map->section, GetCurrentProcess(), &map->address, + /* ZeroBits */ 0, + /* CommitSize */ 0, + /* SectionOffset */ NULL, &ViewSize, + /* InheritDisposition */ ViewUnmap, + /* AllocationType */ (flags & MDBX_RDONLY) ? 0 : MEM_RESERVE, + /* Win32Protect */ + (flags & MDBX_WRITEMAP) ? PAGE_READWRITE : PAGE_READONLY); + + if (!NT_SUCCESS(status)) { + if (status == (NTSTATUS) /* STATUS_CONFLICTING_ADDRESSES */ 0xC0000018 && + map->address && (flags & MDBX_MRESIZE_MAY_MOVE) != 0) { + /* try remap at another base address */ + map->address = NULL; + goto retry_mapview; + } + NtClose(map->section); + map->section = NULL; + + if (map->address && (size != map->current || limit != map->limit)) { + /* try remap with previously size and limit, + * but will return MDBX_UNABLE_EXTEND_MAPSIZE on success */ + rc = (limit > map->limit) ? MDBX_UNABLE_EXTEND_MAPSIZE : MDBX_RESULT_TRUE; + size = map->current; + ReservedSize = limit = map->limit; + goto retry_file_and_section; + } + + /* no way to recovery */ + goto bailout_ntstatus; + } + assert(map->address != MAP_FAILED); + + map->current = (size_t)SectionSize.QuadPart; + map->limit = ViewSize; + +#else /* Windows */ + + uint64_t filesize = 0; + int rc = mdbx_filesize(map->fd, &filesize); + if (rc != MDBX_SUCCESS) + return rc; + + if (flags & MDBX_RDONLY) { + map->current = (filesize > limit) ? limit : (size_t)filesize; + if (map->current != size) + rc = + (size > map->current) ? MDBX_UNABLE_EXTEND_MAPSIZE : MDBX_RESULT_TRUE; + } else if (filesize != size) { + rc = mdbx_ftruncate(map->fd, size); + if (rc != MDBX_SUCCESS) + return rc; + map->current = size; + } + + if (limit == map->limit) + return MDBX_SUCCESS; + + if (limit < map->limit) { + /* unmap an excess at end of mapping. */ + if (unlikely(munmap(map->dxb + limit, map->limit - limit))) + return errno; + map->limit = limit; + return MDBX_SUCCESS; + } + + assert(limit > map->limit); + uint8_t *ptr = MAP_FAILED; + +#if defined(MREMAP_MAYMOVE) + ptr = mremap(map->address, map->limit, limit, + (flags & MDBX_MRESIZE_MAY_MOVE) ? MREMAP_MAYMOVE : 0); + if (ptr == MAP_FAILED) { + const int err = errno; + switch (err) { + default: + return err; + case EAGAIN: + case ENOMEM: + return MDBX_UNABLE_EXTEND_MAPSIZE; + case EFAULT /* MADV_DODUMP / MADV_DONTDUMP are mixed for mmap-range */: + break; + } + } +#endif /* MREMAP_MAYMOVE */ + + const unsigned mmap_flags = + MAP_CONCEAL | MAP_SHARED | MAP_FILE | MAP_NORESERVE | + (F_ISSET(flags, MDBX_UTTERLY_NOSYNC) ? MAP_NOSYNC : 0); + const unsigned mmap_prot = + (flags & MDBX_WRITEMAP) ? PROT_READ | PROT_WRITE : PROT_READ; + + if (ptr == MAP_FAILED) { + /* Try to mmap additional space beyond the end of mapping. 
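+     * [editorial note, not part of the upstream comment] MAP_FIXED_NOREPLACE
+     * (Linux >= 4.17) asks the kernel to map exactly at the hinted address
+     * and to fail with EEXIST when the range is already occupied. Kernels
+     * that predate the flag silently ignore it and may return a mapping at
+     * some other address, which is why the result is compared against the
+     * requested address and a misplaced mapping is unmapped again; EEXIST
+     * and EINVAL are treated as soft failures that fall through to the
+     * unmap-and-remap path below.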
*/ + ptr = mmap(map->dxb + map->limit, limit - map->limit, mmap_prot, + mmap_flags | MAP_FIXED_NOREPLACE, map->fd, map->limit); + if (ptr == map->dxb + map->limit) + ptr = map->dxb; + else if (ptr != MAP_FAILED) { + /* the desired address is busy, unmap unsuitable one */ + if (unlikely(munmap(ptr, limit - map->limit))) + return errno; + ptr = MAP_FAILED; + } else { + const int err = errno; + switch (err) { + default: + return err; + case EAGAIN: + case ENOMEM: + return MDBX_UNABLE_EXTEND_MAPSIZE; + case EEXIST: /* address busy */ + case EINVAL: /* kernel don't support MAP_FIXED_NOREPLACE */ + break; + } + } + } + + if (ptr == MAP_FAILED) { + /* unmap and map again whole region */ + if ((flags & MDBX_MRESIZE_MAY_UNMAP) == 0) { + /* TODO: Perhaps here it is worth to implement suspend/resume threads + * and perform unmap/map as like for Windows. */ + return MDBX_UNABLE_EXTEND_MAPSIZE; + } + + if (unlikely(munmap(map->address, map->limit))) + return errno; + + ptr = mmap(map->address, limit, mmap_prot, + (flags & MDBX_MRESIZE_MAY_MOVE) + ? mmap_flags + : mmap_flags | (MAP_FIXED_NOREPLACE ? MAP_FIXED_NOREPLACE + : MAP_FIXED), + map->fd, 0); + if (MAP_FIXED_NOREPLACE != 0 && MAP_FIXED_NOREPLACE != MAP_FIXED && + unlikely(ptr == MAP_FAILED) && !(flags & MDBX_MRESIZE_MAY_MOVE) && + errno == /* kernel don't support MAP_FIXED_NOREPLACE */ EINVAL) + ptr = mmap(map->address, limit, mmap_prot, mmap_flags | MAP_FIXED, + map->fd, 0); + + if (unlikely(ptr == MAP_FAILED)) { + /* try to restore prev mapping */ + ptr = mmap(map->address, map->limit, mmap_prot, + (flags & MDBX_MRESIZE_MAY_MOVE) + ? mmap_flags + : mmap_flags | (MAP_FIXED_NOREPLACE ? MAP_FIXED_NOREPLACE + : MAP_FIXED), + map->fd, 0); + if (MAP_FIXED_NOREPLACE != 0 && MAP_FIXED_NOREPLACE != MAP_FIXED && + unlikely(ptr == MAP_FAILED) && !(flags & MDBX_MRESIZE_MAY_MOVE) && + errno == /* kernel don't support MAP_FIXED_NOREPLACE */ EINVAL) + ptr = mmap(map->address, map->limit, mmap_prot, mmap_flags | MAP_FIXED, + map->fd, 0); + if (unlikely(ptr == MAP_FAILED)) { + VALGRIND_MAKE_MEM_NOACCESS(map->address, map->current); + /* Unpoisoning is required for ASAN to avoid false-positive diagnostic + * when this memory will re-used by malloc or another mmapping. + * See https://github.com/erthink/libmdbx/pull/93#issuecomment-613687203 + */ + ASAN_UNPOISON_MEMORY_REGION(map->address, map->limit); + map->limit = 0; + map->current = 0; + map->address = nullptr; + return errno; + } + rc = MDBX_UNABLE_EXTEND_MAPSIZE; + limit = map->limit; + } + } + + assert(ptr && ptr != MAP_FAILED); + if (map->address != ptr) { + VALGRIND_MAKE_MEM_NOACCESS(map->address, map->current); + /* Unpoisoning is required for ASAN to avoid false-positive diagnostic + * when this memory will re-used by malloc or another mmapping. 
+ * See https://github.com/erthink/libmdbx/pull/93#issuecomment-613687203
+ */
+    ASAN_UNPOISON_MEMORY_REGION(map->address, map->limit);
+
+    VALGRIND_MAKE_MEM_DEFINED(ptr, map->current);
+    ASAN_UNPOISON_MEMORY_REGION(ptr, map->current);
+    map->address = ptr;
+  }
+  map->limit = limit;
+
+#if MDBX_ENABLE_MADVISE
+#ifdef MADV_DONTFORK
+  if (unlikely(madvise(map->address, map->limit, MADV_DONTFORK) != 0))
+    return errno;
+#endif /* MADV_DONTFORK */
+#ifdef MADV_NOHUGEPAGE
+  (void)madvise(map->address, map->limit, MADV_NOHUGEPAGE);
+#endif /* MADV_NOHUGEPAGE */
+#endif /* MDBX_ENABLE_MADVISE */
+
+#endif /* POSIX / Windows */
+
+  return rc;
+}
+
+/*----------------------------------------------------------------------------*/
+
+MDBX_INTERNAL_FUNC __cold void mdbx_osal_jitter(bool tiny) {
+  for (;;) {
+#if defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \
+    defined(__x86_64__)
+    const unsigned salt = 277u * (unsigned)__rdtsc();
+#else
+    const unsigned salt = rand();
+#endif
+
+    const unsigned coin = salt % (tiny ? 29u : 43u);
+    if (coin < 43 / 3)
+      break;
+#if defined(_WIN32) || defined(_WIN64)
+    SwitchToThread();
+    if (coin > 43 * 2 / 3)
+      Sleep(1);
+#else
+    sched_yield();
+    if (coin > 43 * 2 / 3)
+      usleep(coin);
+#endif
+  }
+}
+
+#if defined(_WIN32) || defined(_WIN64)
+#elif defined(__APPLE__) || defined(__MACH__)
+#include <mach/mach_time.h>
+#elif defined(__linux__) || defined(__gnu_linux__)
+static __cold clockid_t choice_monoclock(void) {
+  struct timespec probe;
+#if defined(CLOCK_BOOTTIME)
+  if (clock_gettime(CLOCK_BOOTTIME, &probe) == 0)
+    return CLOCK_BOOTTIME;
+#elif defined(CLOCK_MONOTONIC_RAW)
+  if (clock_gettime(CLOCK_MONOTONIC_RAW, &probe) == 0)
+    return CLOCK_MONOTONIC_RAW;
+#elif defined(CLOCK_MONOTONIC_COARSE)
+  if (clock_gettime(CLOCK_MONOTONIC_COARSE, &probe) == 0)
+    return CLOCK_MONOTONIC_COARSE;
+#endif
+  return CLOCK_MONOTONIC;
+}
+#endif
+
+/*----------------------------------------------------------------------------*/
+
+#if defined(_WIN32) || defined(_WIN64)
+static LARGE_INTEGER performance_frequency;
+#elif defined(__APPLE__) || defined(__MACH__)
+static uint64_t ratio_16dot16_to_monotine;
+#endif
+
+MDBX_INTERNAL_FUNC uint64_t
+mdbx_osal_16dot16_to_monotime(uint32_t seconds_16dot16) {
+#if defined(_WIN32) || defined(_WIN64)
+  if (unlikely(performance_frequency.QuadPart == 0))
+    QueryPerformanceFrequency(&performance_frequency);
+  const uint64_t ratio = performance_frequency.QuadPart;
+#elif defined(__APPLE__) || defined(__MACH__)
+  if (unlikely(ratio_16dot16_to_monotine == 0)) {
+    mach_timebase_info_data_t ti;
+    mach_timebase_info(&ti);
+    ratio_16dot16_to_monotine = UINT64_C(1000000000) * ti.denom / ti.numer;
+  }
+  const uint64_t ratio = ratio_16dot16_to_monotine;
+#else
+  const uint64_t ratio = UINT64_C(1000000000);
+#endif
+  const uint64_t ret = (ratio * seconds_16dot16 + 32768) >> 16;
+  return likely(ret || seconds_16dot16 == 0) ? ret : /* fix underflow */ 1;
+}
+
+MDBX_INTERNAL_FUNC uint32_t mdbx_osal_monotime_to_16dot16(uint64_t monotime) {
+  static uint64_t limit;
+  if (unlikely(monotime > limit)) {
+    if (limit != 0)
+      return UINT32_MAX;
+    limit = mdbx_osal_16dot16_to_monotime(UINT32_MAX - 1);
+    if (monotime > limit)
+      return UINT32_MAX;
+  }
+  const uint32_t ret =
+#if defined(_WIN32) || defined(_WIN64)
+      (uint32_t)((monotime << 16) / performance_frequency.QuadPart);
+#elif defined(__APPLE__) || defined(__MACH__)
+      (uint32_t)((monotime << 16) / ratio_16dot16_to_monotine);
+#else
+      (uint32_t)(monotime * 128 / 1953125);
+#endif
+  return likely(ret || monotime == 0) ? 
ret : /* fix underflow */ 1; +} + +MDBX_INTERNAL_FUNC uint64_t mdbx_osal_monotime(void) { +#if defined(_WIN32) || defined(_WIN64) + LARGE_INTEGER counter; + counter.QuadPart = 0; + QueryPerformanceCounter(&counter); + return counter.QuadPart; +#elif defined(__APPLE__) || defined(__MACH__) + return mach_absolute_time(); +#else + +#if defined(__linux__) || defined(__gnu_linux__) + static clockid_t posix_clockid = -1; + if (unlikely(posix_clockid < 0)) + posix_clockid = choice_monoclock(); +#elif defined(CLOCK_MONOTONIC) +#define posix_clockid CLOCK_MONOTONIC +#else +#define posix_clockid CLOCK_REALTIME +#endif + + struct timespec ts; + if (unlikely(clock_gettime(posix_clockid, &ts) != 0)) { + ts.tv_nsec = 0; + ts.tv_sec = 0; + } + return ts.tv_sec * UINT64_C(1000000000) + ts.tv_nsec; +#endif +} + +/*----------------------------------------------------------------------------*/ + +static void bootid_shake(bin128_t *p) { + /* Bob Jenkins's PRNG: https://burtleburtle.net/bob/rand/smallprng.html */ + const uint32_t e = p->a - (p->b << 23 | p->b >> 9); + p->a = p->b ^ (p->c << 16 | p->c >> 16); + p->b = p->c + (p->d << 11 | p->d >> 21); + p->c = p->d + e; + p->d = e + p->a; +} + +static void bootid_collect(bin128_t *p, const void *s, size_t n) { + p->y += UINT64_C(64526882297375213); + bootid_shake(p); + for (size_t i = 0; i < n; ++i) { + bootid_shake(p); + p->y ^= UINT64_C(48797879452804441) * ((const uint8_t *)s)[i]; + bootid_shake(p); + p->y += 14621231; + } + bootid_shake(p); + + /* minor non-linear tomfoolery */ + const unsigned z = p->x % 61; + p->y = p->y << z | p->y >> (64 - z); + bootid_shake(p); + bootid_shake(p); + const unsigned q = p->x % 59; + p->y = p->y << q | p->y >> (64 - q); + bootid_shake(p); + bootid_shake(p); + bootid_shake(p); +} + +#if defined(_WIN32) || defined(_WIN64) + +static uint64_t windows_systemtime_ms() { + FILETIME ft; + GetSystemTimeAsFileTime(&ft); + return ((uint64_t)ft.dwHighDateTime << 32 | ft.dwLowDateTime) / 10000ul; +} + +static uint64_t windows_bootime(void) { + unsigned confirmed = 0; + uint64_t boottime = 0; + uint64_t up0 = mdbx_GetTickCount64(); + uint64_t st0 = windows_systemtime_ms(); + for (uint64_t fuse = st0; up0 && st0 < fuse + 1000 * 1000u / 42;) { + YieldProcessor(); + const uint64_t up1 = mdbx_GetTickCount64(); + const uint64_t st1 = windows_systemtime_ms(); + if (st1 > fuse && st1 == st0 && up1 == up0) { + uint64_t diff = st1 - up1; + if (boottime == diff) { + if (++confirmed > 4) + return boottime; + } else { + confirmed = 0; + boottime = diff; + } + fuse = st1; + Sleep(1); + } + st0 = st1; + up0 = up1; + } + return 0; +} + +static LSTATUS mdbx_RegGetValue(HKEY hKey, LPCSTR lpSubKey, LPCSTR lpValue, + PVOID pvData, LPDWORD pcbData) { + LSTATUS rc; + if (!mdbx_RegGetValueA) { + /* an old Windows 2000/XP */ + HKEY hSubKey; + rc = RegOpenKeyA(hKey, lpSubKey, &hSubKey); + if (rc == ERROR_SUCCESS) { + rc = RegQueryValueExA(hSubKey, lpValue, NULL, NULL, pvData, pcbData); + RegCloseKey(hSubKey); + } + return rc; + } + + rc = mdbx_RegGetValueA(hKey, lpSubKey, lpValue, RRF_RT_ANY, NULL, pvData, + pcbData); + if (rc != ERROR_FILE_NOT_FOUND) + return rc; + + rc = mdbx_RegGetValueA(hKey, lpSubKey, lpValue, + RRF_RT_ANY | 0x00010000 /* RRF_SUBKEY_WOW6464KEY */, + NULL, pvData, pcbData); + if (rc != ERROR_FILE_NOT_FOUND) + return rc; + return mdbx_RegGetValueA(hKey, lpSubKey, lpValue, + RRF_RT_ANY | 0x00020000 /* RRF_SUBKEY_WOW6432KEY */, + NULL, pvData, pcbData); +} +#endif + +MDBX_MAYBE_UNUSED static __cold bool +bootid_parse_uuid(bin128_t *s, const 
void *p, const size_t n) { + if (n > 31) { + unsigned bits = 0; + for (unsigned i = 0; i < n; ++i) /* try parse an UUID in text form */ { + uint8_t c = ((const uint8_t *)p)[i]; + if (c >= '0' && c <= '9') + c -= '0'; + else if (c >= 'a' && c <= 'f') + c -= 'a' - 10; + else if (c >= 'A' && c <= 'F') + c -= 'A' - 10; + else + continue; + assert(c <= 15); + c ^= s->y >> 60; + s->y = s->y << 4 | s->x >> 60; + s->x = s->x << 4 | c; + bits += 4; + } + if (bits > 42 * 3) + /* UUID parsed successfully */ + return true; + } + + if (n > 15) /* is enough handle it as a binary? */ { + if (n == sizeof(bin128_t)) { + bin128_t aligned; + memcpy(&aligned, p, sizeof(bin128_t)); + s->x += aligned.x; + s->y += aligned.y; + } else + bootid_collect(s, p, n); + return true; + } + + if (n) + bootid_collect(s, p, n); + return false; +} + +__cold MDBX_INTERNAL_FUNC bin128_t mdbx_osal_bootid(void) { + bin128_t bin = {{0, 0}}; + bool got_machineid = false, got_boottime = false, got_bootseq = false; + +#if defined(__linux__) || defined(__gnu_linux__) + { + const int fd = + open("/proc/sys/kernel/random/boot_id", O_RDONLY | O_NOFOLLOW); + if (fd != -1) { + struct statfs fs; + char buf[42]; + const ssize_t len = + (fstatfs(fd, &fs) == 0 && fs.f_type == /* procfs */ 0x9FA0) + ? read(fd, buf, sizeof(buf)) + : -1; + const int err = close(fd); + assert(err == 0); + (void)err; + if (len > 0 && bootid_parse_uuid(&bin, buf, len)) + return bin; + } + } +#endif /* Linux */ + +#if defined(__APPLE__) || defined(__MACH__) + { + char buf[42]; + size_t len = sizeof(buf); + if (!sysctlbyname("kern.bootsessionuuid", buf, &len, nullptr, 0) && + bootid_parse_uuid(&bin, buf, len)) + return bin; + +#if defined(__MAC_OS_X_VERSION_MIN_REQUIRED) && \ + __MAC_OS_X_VERSION_MIN_REQUIRED > 1050 + uuid_t uuid; + struct timespec wait = {0, 1000000000u / 42}; + if (!gethostuuid(uuid, &wait) && + bootid_parse_uuid(&bin, uuid, sizeof(uuid))) + got_machineid = true; +#endif /* > 10.5 */ + + struct timeval boottime; + len = sizeof(boottime); + if (!sysctlbyname("kern.boottime", &boottime, &len, nullptr, 0) && + len == sizeof(boottime) && boottime.tv_sec) + got_boottime = true; + } +#endif /* Apple/Darwin */ + +#if defined(_WIN32) || defined(_WIN64) + { + union buf { + DWORD BootId; + DWORD BaseTime; + SYSTEM_TIMEOFDAY_INFORMATION SysTimeOfDayInfo; + struct { + LARGE_INTEGER BootTime; + LARGE_INTEGER CurrentTime; + LARGE_INTEGER TimeZoneBias; + ULONG TimeZoneId; + ULONG Reserved; + ULONGLONG BootTimeBias; + ULONGLONG SleepTimeBias; + } SysTimeOfDayInfoHacked; + wchar_t MachineGuid[42]; + char DigitalProductId[248]; + } buf; + + static const char HKLM_MicrosoftCryptography[] = + "SOFTWARE\\Microsoft\\Cryptography"; + DWORD len = sizeof(buf); + /* Windows is madness and must die */ + if (mdbx_RegGetValue(HKEY_LOCAL_MACHINE, HKLM_MicrosoftCryptography, + "MachineGuid", &buf.MachineGuid, + &len) == ERROR_SUCCESS && + len > 42 && len < sizeof(buf)) + got_machineid = bootid_parse_uuid(&bin, &buf.MachineGuid, len); + + if (!got_machineid) { + /* again, Windows is madness */ + static const char HKLM_WindowsNT[] = + "SOFTWARE\\Microsoft\\Windows NT\\CurrentVersion"; + static const char HKLM_WindowsNT_DPK[] = + "SOFTWARE\\Microsoft\\Windows " + "NT\\CurrentVersion\\DefaultProductKey"; + static const char HKLM_WindowsNT_DPK2[] = + "SOFTWARE\\Microsoft\\Windows " + "NT\\CurrentVersion\\DefaultProductKey2"; + + len = sizeof(buf); + if (mdbx_RegGetValue(HKEY_LOCAL_MACHINE, HKLM_WindowsNT, + "DigitalProductId", &buf.DigitalProductId, + &len) == ERROR_SUCCESS && + len 
> 42 && len < sizeof(buf)) { + bootid_collect(&bin, &buf.DigitalProductId, len); + got_machineid = true; + } + len = sizeof(buf); + if (mdbx_RegGetValue(HKEY_LOCAL_MACHINE, HKLM_WindowsNT_DPK, + "DigitalProductId", &buf.DigitalProductId, + &len) == ERROR_SUCCESS && + len > 42 && len < sizeof(buf)) { + bootid_collect(&bin, &buf.DigitalProductId, len); + got_machineid = true; + } + len = sizeof(buf); + if (mdbx_RegGetValue(HKEY_LOCAL_MACHINE, HKLM_WindowsNT_DPK2, + "DigitalProductId", &buf.DigitalProductId, + &len) == ERROR_SUCCESS && + len > 42 && len < sizeof(buf)) { + bootid_collect(&bin, &buf.DigitalProductId, len); + got_machineid = true; + } + } + + static const char HKLM_PrefetcherParams[] = + "SYSTEM\\CurrentControlSet\\Control\\Session Manager\\Memory " + "Management\\PrefetchParameters"; + len = sizeof(buf); + if (mdbx_RegGetValue(HKEY_LOCAL_MACHINE, HKLM_PrefetcherParams, "BootId", + &buf.BootId, &len) == ERROR_SUCCESS && + len > 1 && len < sizeof(buf)) { + bootid_collect(&bin, &buf.BootId, len); + got_bootseq = true; + } + + len = sizeof(buf); + if (mdbx_RegGetValue(HKEY_LOCAL_MACHINE, HKLM_PrefetcherParams, "BaseTime", + &buf.BaseTime, &len) == ERROR_SUCCESS && + len >= sizeof(buf.BaseTime) && buf.BaseTime) { + bootid_collect(&bin, &buf.BaseTime, len); + got_boottime = true; + } + + /* BootTime from SYSTEM_TIMEOFDAY_INFORMATION */ + NTSTATUS status = NtQuerySystemInformation( + 0x03 /* SystemTmeOfDayInformation */, &buf.SysTimeOfDayInfo, + sizeof(buf.SysTimeOfDayInfo), &len); + if (NT_SUCCESS(status) && + len >= offsetof(union buf, SysTimeOfDayInfoHacked.BootTime) + + sizeof(buf.SysTimeOfDayInfoHacked.BootTime) && + buf.SysTimeOfDayInfoHacked.BootTime.QuadPart) { + bootid_collect(&bin, &buf.SysTimeOfDayInfoHacked.BootTime, + sizeof(buf.SysTimeOfDayInfoHacked.BootTime)); + got_boottime = true; + } + + if (!got_boottime) { + uint64_t boottime = windows_bootime(); + if (boottime) { + bootid_collect(&bin, &boottime, sizeof(boottime)); + got_boottime = true; + } + } + } +#endif /* Windows */ + +#if defined(CTL_HW) && defined(HW_UUID) + if (!got_machineid) { + static const int mib[] = {CTL_HW, HW_UUID}; + char buf[42]; + size_t len = sizeof(buf); + if (sysctl( +#ifdef SYSCTL_LEGACY_NONCONST_MIB + (int *) +#endif + mib, + ARRAY_LENGTH(mib), &buf, &len, NULL, 0) == 0) + got_machineid = bootid_parse_uuid(&bin, buf, len); + } +#endif /* CTL_HW && HW_UUID */ + +#if defined(CTL_KERN) && defined(KERN_HOSTUUID) + if (!got_machineid) { + static const int mib[] = {CTL_KERN, KERN_HOSTUUID}; + char buf[42]; + size_t len = sizeof(buf); + if (sysctl( +#ifdef SYSCTL_LEGACY_NONCONST_MIB + (int *) +#endif + mib, + ARRAY_LENGTH(mib), &buf, &len, NULL, 0) == 0) + got_machineid = bootid_parse_uuid(&bin, buf, len); + } +#endif /* CTL_KERN && KERN_HOSTUUID */ + +#if defined(__NetBSD__) + if (!got_machineid) { + char buf[42]; + size_t len = sizeof(buf); + if (sysctlbyname("machdep.dmi.system-uuid", buf, &len, NULL, 0) == 0) + got_machineid = bootid_parse_uuid(&bin, buf, len); + } +#endif /* __NetBSD__ */ + +#if _XOPEN_SOURCE_EXTENDED + if (!got_machineid) { + const int hostid = gethostid(); + if (hostid > 0) { + bootid_collect(&bin, &hostid, sizeof(hostid)); + got_machineid = true; + } + } +#endif /* _XOPEN_SOURCE_EXTENDED */ + + if (!got_machineid) { + lack: + bin.x = bin.y = 0; + return bin; + } + + /*--------------------------------------------------------------------------*/ + +#if defined(CTL_KERN) && defined(KERN_BOOTTIME) + if (!got_boottime) { + static const int mib[] = {CTL_KERN, KERN_BOOTTIME}; + 
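/* BSD-family fallback: sysctl(KERN_BOOTTIME) reports the kernel's boot
+     * timestamp, which is folded into the boot-id so that the id differs
+     * between reboots even when no boot/machine UUID was available. */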
+    struct timeval boottime;
+    size_t len = sizeof(boottime);
+    if (sysctl(
+#ifdef SYSCTL_LEGACY_NONCONST_MIB
+            (int *)
+#endif
+                mib,
+            ARRAY_LENGTH(mib), &boottime, &len, NULL, 0) == 0 &&
+        len == sizeof(boottime) && boottime.tv_sec) {
+      bootid_collect(&bin, &boottime, len);
+      got_boottime = true;
+    }
+  }
+#endif /* CTL_KERN && KERN_BOOTTIME */
+
+#if defined(__sun) || defined(__SVR4) || defined(__svr4__)
+  if (!got_boottime) {
+    kstat_ctl_t *kc = kstat_open();
+    if (kc) {
+      kstat_t *kp = kstat_lookup(kc, "unix", 0, "system_misc");
+      if (kp && kstat_read(kc, kp, 0) != -1) {
+        kstat_named_t *kn = (kstat_named_t *)kstat_data_lookup(kp, "boot_time");
+        if (kn) {
+          switch (kn->data_type) {
+          case KSTAT_DATA_INT32:
+          case KSTAT_DATA_UINT32:
+            bootid_collect(&bin, &kn->value, sizeof(int32_t));
+            got_boottime = true;
+            break;
+          case KSTAT_DATA_INT64:
+          case KSTAT_DATA_UINT64:
+            bootid_collect(&bin, &kn->value, sizeof(int64_t));
+            got_boottime = true;
+            break;
+          }
+        }
+      }
+      kstat_close(kc);
+    }
+  }
+#endif /* SunOS / Solaris */
+
+#if _XOPEN_SOURCE_EXTENDED && defined(BOOT_TIME)
+  if (!got_boottime) {
+    setutxent();
+    const struct utmpx id = {.ut_type = BOOT_TIME};
+    const struct utmpx *entry = getutxid(&id);
+    if (entry) {
+      bootid_collect(&bin, entry, sizeof(*entry));
+      got_boottime = true;
+      while (unlikely((entry = getutxid(&id)) != nullptr)) {
+        /* have multiple reboot records, assuming we can distinguish the next
+         * boot session even if the RTC is wrong or absent */
+        bootid_collect(&bin, entry, sizeof(*entry));
+        got_bootseq = true;
+      }
+    }
+    endutxent();
+  }
+#endif /* _XOPEN_SOURCE_EXTENDED && BOOT_TIME */
+
+  if (!got_bootseq) {
+    if (!got_boottime || !MDBX_TRUST_RTC)
+      goto lack;
+
+#if defined(_WIN32) || defined(_WIN64)
+    FILETIME now;
+    GetSystemTimeAsFileTime(&now);
+    if (0x1CCCCCC > now.dwHighDateTime)
+#else
+    struct timespec mono, real;
+    if (clock_gettime(CLOCK_MONOTONIC, &mono) ||
+        clock_gettime(CLOCK_REALTIME, &real) ||
+        /* wrong time, RTC is mad or absent */
+        1555555555l > real.tv_sec ||
+        /* seems no adjustment by RTC/NTP, i.e.
a fake time */ + real.tv_sec < mono.tv_sec || 1234567890l > real.tv_sec - mono.tv_sec || + (real.tv_sec - mono.tv_sec) % 900u == 0) +#endif + goto lack; + } + + return bin; +} + +__cold int mdbx_get_sysraminfo(intptr_t *page_size, intptr_t *total_pages, + intptr_t *avail_pages) { + if (!page_size && !total_pages && !avail_pages) + return MDBX_EINVAL; + if (total_pages) + *total_pages = -1; + if (avail_pages) + *avail_pages = -1; + + const intptr_t pagesize = mdbx_syspagesize(); + if (page_size) + *page_size = pagesize; + if (unlikely(pagesize < MIN_PAGESIZE || !is_powerof2(pagesize))) + return MDBX_INCOMPATIBLE; + + MDBX_MAYBE_UNUSED const int log2page = log2n_powerof2(pagesize); + assert(pagesize == (INT64_C(1) << log2page)); + (void)log2page; + +#if defined(_WIN32) || defined(_WIN64) + MEMORYSTATUSEX info; + memset(&info, 0, sizeof(info)); + info.dwLength = sizeof(info); + if (!GlobalMemoryStatusEx(&info)) + return (int)GetLastError(); +#endif + + if (total_pages) { +#if defined(_WIN32) || defined(_WIN64) + const intptr_t total_ram_pages = (intptr_t)(info.ullTotalPhys >> log2page); +#elif defined(_SC_PHYS_PAGES) + const intptr_t total_ram_pages = sysconf(_SC_PHYS_PAGES); + if (total_ram_pages == -1) + return errno; +#elif defined(_SC_AIX_REALMEM) + const intptr_t total_ram_Kb = sysconf(_SC_AIX_REALMEM); + if (total_ram_Kb == -1) + return errno; + const intptr_t total_ram_pages = (total_ram_Kb << 10) >> log2page; +#elif defined(HW_USERMEM) || defined(HW_PHYSMEM64) || defined(HW_MEMSIZE) || \ + defined(HW_PHYSMEM) + size_t ram, len = sizeof(ram); + static const int mib[] = { + CTL_HW, +#if defined(HW_USERMEM) + HW_USERMEM +#elif defined(HW_PHYSMEM64) + HW_PHYSMEM64 +#elif defined(HW_MEMSIZE) + HW_MEMSIZE +#else + HW_PHYSMEM +#endif + }; + if (sysctl( +#ifdef SYSCTL_LEGACY_NONCONST_MIB + (int *) +#endif + mib, + ARRAY_LENGTH(mib), &ram, &len, NULL, 0) != 0) + return errno; + if (len != sizeof(ram)) + return MDBX_ENOSYS; + const intptr_t total_ram_pages = (intptr_t)(ram >> log2page); +#else +#error "FIXME: Get User-accessible or physical RAM" +#endif + *total_pages = total_ram_pages; + if (total_ram_pages < 1) + return MDBX_ENOSYS; + } + + if (avail_pages) { +#if defined(_WIN32) || defined(_WIN64) + const intptr_t avail_ram_pages = (intptr_t)(info.ullAvailPhys >> log2page); +#elif defined(_SC_AVPHYS_PAGES) + const intptr_t avail_ram_pages = sysconf(_SC_AVPHYS_PAGES); + if (avail_ram_pages == -1) + return errno; +#elif defined(__MACH__) + mach_msg_type_number_t count = HOST_VM_INFO_COUNT; + vm_statistics_data_t vmstat; + mach_port_t mport = mach_host_self(); + kern_return_t kerr = host_statistics(mach_host_self(), HOST_VM_INFO, + (host_info_t)&vmstat, &count); + mach_port_deallocate(mach_task_self(), mport); + if (unlikely(kerr != KERN_SUCCESS)) + return MDBX_ENOSYS; + const intptr_t avail_ram_pages = vmstat.free_count; +#elif defined(VM_TOTAL) || defined(VM_METER) + struct vmtotal info; + size_t len = sizeof(info); + static const int mib[] = { + CTL_VM, +#if defined(VM_TOTAL) + VM_TOTAL +#elif defined(VM_METER) + VM_METER +#endif + }; + if (sysctl( +#ifdef SYSCTL_LEGACY_NONCONST_MIB + (int *) +#endif + mib, + ARRAY_LENGTH(mib), &info, &len, NULL, 0) != 0) + return errno; + if (len != sizeof(info)) + return MDBX_ENOSYS; + const intptr_t avail_ram_pages = info.t_free; +#else +#error "FIXME: Get Available RAM" +#endif + *avail_pages = avail_ram_pages; + if (avail_ram_pages < 1) + return MDBX_ENOSYS; + } + + return MDBX_SUCCESS; +} +/* This is CMake-template for libmdbx's version.c + 
******************************************************************************/
+
+
+#if MDBX_VERSION_MAJOR != 0 ||                                                \
+    MDBX_VERSION_MINOR != 10
+#error "API version mismatch! Had `git fetch --tags` done?"
+#endif
+
+static const char sourcery[] = STRINGIFY(MDBX_BUILD_SOURCERY);
+
+__dll_export
+#ifdef __attribute_used__
+    __attribute_used__
+#elif defined(__GNUC__) || __has_attribute(__used__)
+    __attribute__((__used__))
+#endif
+#ifdef __attribute_externally_visible__
+    __attribute_externally_visible__
+#elif (defined(__GNUC__) && !defined(__clang__)) ||                           \
+    __has_attribute(__externally_visible__)
+    __attribute__((__externally_visible__))
+#endif
+    const struct MDBX_version_info mdbx_version = {
+        0,
+        10,
+        1,
+        15,
+        {"2021-06-18T15:13:51+03:00", "1c2ca15627c5c4e72657c00530c8a9a71ccd5128", "63e7276c7da864d47c004cc959dd8c6b1731c247",
+         "v0.10.1-15-g63e7276c"},
+        sourcery};
+
+__dll_export
+#ifdef __attribute_used__
+    __attribute_used__
+#elif defined(__GNUC__) || __has_attribute(__used__)
+    __attribute__((__used__))
+#endif
+#ifdef __attribute_externally_visible__
+    __attribute_externally_visible__
+#elif (defined(__GNUC__) && !defined(__clang__)) ||                           \
+    __has_attribute(__externally_visible__)
+    __attribute__((__externally_visible__))
+#endif
+    const char *const mdbx_sourcery_anchor = sourcery;
+/*
+ * Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru>
+ * and other libmdbx authors: please see AUTHORS file.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted only as authorized by the OpenLDAP
+ * Public License.
+ *
+ * A copy of this license is available in the file LICENSE in the
+ * top-level directory of the distribution or, alternatively, at
+ * <http://www.OpenLDAP.org/license.html>.
+ */
+
+#if defined(_WIN32) || defined(_WIN64) /* Windows LCK-implementation */
+
+/* PREAMBLE FOR WINDOWS:
+ *
+ * We are not concerned with performance here.
+ * If you are running Windows, performance could NOT be the goal.
+ * Otherwise please use Linux. */
+
+
+static void mdbx_winnt_import(void);
+
+#if MDBX_BUILD_SHARED_LIBRARY
+#if MDBX_WITHOUT_MSVC_CRT && defined(NDEBUG)
+/* DEBUG/CHECKED builds still require MSVC's CRT for runtime checks.
+ *
+ * Define dll's entry point only for Release build when NDEBUG is defined and
+ * MDBX_WITHOUT_MSVC_CRT=ON.
+ * If the entry point isn't defined then MSVC will automatically use
+ * DllMainCRTStartup() from the CRT library, which also automatically calls
+ * DllMain() from our mdbx.dll */
+#pragma comment(linker, "/ENTRY:DllMain")
+#endif /* MDBX_WITHOUT_MSVC_CRT */
+
+BOOL APIENTRY DllMain(HANDLE module, DWORD reason, LPVOID reserved)
+#else
+#if !MDBX_MANUAL_MODULE_HANDLER
+static
+#endif /* !MDBX_MANUAL_MODULE_HANDLER */
+    void NTAPI
+    mdbx_module_handler(PVOID module, DWORD reason, PVOID reserved)
+#endif /* MDBX_BUILD_SHARED_LIBRARY */
+{
+  (void)reserved;
+  switch (reason) {
+  case DLL_PROCESS_ATTACH:
+    mdbx_winnt_import();
+    mdbx_rthc_global_init();
+    break;
+  case DLL_PROCESS_DETACH:
+    mdbx_rthc_global_dtor();
+    break;
+
+  case DLL_THREAD_ATTACH:
+    break;
+  case DLL_THREAD_DETACH:
+    mdbx_rthc_thread_dtor(module);
+    break;
+  }
+#if MDBX_BUILD_SHARED_LIBRARY
+  return TRUE;
+#endif
+}
+
+#if !MDBX_BUILD_SHARED_LIBRARY && !MDBX_MANUAL_MODULE_HANDLER
+/* *INDENT-OFF* */
+/* clang-format off */
+#if defined(_MSC_VER)
+# pragma const_seg(push)
+# pragma data_seg(push)
+
+# ifdef _WIN64
+  /* kick a linker to create the TLS directory if not already done */
+# pragma comment(linker, "/INCLUDE:_tls_used")
+  /* Force some symbol references. */
+# pragma comment(linker, "/INCLUDE:mdbx_tls_anchor")
+  /* specific const-segment for WIN64 */
+# pragma const_seg(".CRT$XLB")
+  const
+# else
+  /* kick a linker to create the TLS directory if not already done */
+# pragma comment(linker, "/INCLUDE:__tls_used")
+  /* Force some symbol references. */
+# pragma comment(linker, "/INCLUDE:_mdbx_tls_anchor")
+  /* specific data-segment for WIN32 */
+# pragma data_seg(".CRT$XLB")
+# endif
+
+  __declspec(allocate(".CRT$XLB")) PIMAGE_TLS_CALLBACK mdbx_tls_anchor = mdbx_module_handler;
+# pragma data_seg(pop)
+# pragma const_seg(pop)
+
+#elif defined(__GNUC__)
+# ifdef _WIN64
+  const
+# endif
+  PIMAGE_TLS_CALLBACK mdbx_tls_anchor __attribute__((__section__(".CRT$XLB"), used)) = mdbx_module_handler;
+#else
+# error FIXME
+#endif
+/* *INDENT-ON* */
+/* clang-format on */
+#endif /* !MDBX_BUILD_SHARED_LIBRARY && !MDBX_MANUAL_MODULE_HANDLER */
+
+/*----------------------------------------------------------------------------*/
+
+#define LCK_SHARED 0
+#define LCK_EXCLUSIVE LOCKFILE_EXCLUSIVE_LOCK
+#define LCK_WAITFOR 0
+#define LCK_DONTWAIT LOCKFILE_FAIL_IMMEDIATELY
+
+static __inline BOOL flock(mdbx_filehandle_t fd, DWORD flags, uint64_t offset,
+                           size_t bytes) {
+  OVERLAPPED ov;
+  ov.hEvent = 0;
+  ov.Offset = (DWORD)offset;
+  ov.OffsetHigh = HIGH_DWORD(offset);
+  return LockFileEx(fd, flags, 0, (DWORD)bytes, HIGH_DWORD(bytes), &ov);
+}
+
+static __inline BOOL funlock(mdbx_filehandle_t fd, uint64_t offset,
+                             size_t bytes) {
+  return UnlockFile(fd, (DWORD)offset, HIGH_DWORD(offset), (DWORD)bytes,
+                    HIGH_DWORD(bytes));
+}
+
+/*----------------------------------------------------------------------------*/
+/* global `write` lock for write-txn processing,
+ * exclusive locking of both meta-pages */
+
+#define LCK_MAXLEN (1u + ((~(size_t)0) >> 1))
+#define LCK_META_OFFSET 0
+#define LCK_META_LEN (MAX_PAGESIZE * NUM_METAS)
+#define LCK_BODY_OFFSET LCK_META_LEN
+#define LCK_BODY_LEN (LCK_MAXLEN - LCK_BODY_OFFSET)
+#define LCK_BODY LCK_BODY_OFFSET, LCK_BODY_LEN
+#define LCK_WHOLE 0, LCK_MAXLEN
+
+int mdbx_txn_lock(MDBX_env *env, bool dontwait) {
+  if (dontwait) {
+    if (!TryEnterCriticalSection(&env->me_windowsbug_lock))
+      return MDBX_BUSY;
+  } else {
+    __try {
+      EnterCriticalSection(&env->me_windowsbug_lock);
+    }
+    __except
((GetExceptionCode() == + 0xC0000194 /* STATUS_POSSIBLE_DEADLOCK / EXCEPTION_POSSIBLE_DEADLOCK */) + ? EXCEPTION_EXECUTE_HANDLER + : EXCEPTION_CONTINUE_SEARCH) { + return ERROR_POSSIBLE_DEADLOCK; + } + } + + if ((env->me_flags & MDBX_EXCLUSIVE) || + flock(env->me_lazy_fd, + dontwait ? (LCK_EXCLUSIVE | LCK_DONTWAIT) + : (LCK_EXCLUSIVE | LCK_WAITFOR), + LCK_BODY)) + return MDBX_SUCCESS; + int rc = (int)GetLastError(); + LeaveCriticalSection(&env->me_windowsbug_lock); + return (!dontwait || rc != ERROR_LOCK_VIOLATION) ? rc : MDBX_BUSY; +} + +void mdbx_txn_unlock(MDBX_env *env) { + int rc = (env->me_flags & MDBX_EXCLUSIVE) + ? TRUE + : funlock(env->me_lazy_fd, LCK_BODY); + LeaveCriticalSection(&env->me_windowsbug_lock); + if (!rc) + mdbx_panic("%s failed: err %u", __func__, (int)GetLastError()); +} + +/*----------------------------------------------------------------------------*/ +/* global `read` lock for readers registration, + * exclusive locking `mti_numreaders` (second) cacheline */ + +#define LCK_LO_OFFSET 0 +#define LCK_LO_LEN offsetof(MDBX_lockinfo, mti_numreaders) +#define LCK_UP_OFFSET LCK_LO_LEN +#define LCK_UP_LEN (sizeof(MDBX_lockinfo) - LCK_UP_OFFSET) +#define LCK_LOWER LCK_LO_OFFSET, LCK_LO_LEN +#define LCK_UPPER LCK_UP_OFFSET, LCK_UP_LEN + +MDBX_INTERNAL_FUNC int mdbx_rdt_lock(MDBX_env *env) { + mdbx_srwlock_AcquireShared(&env->me_remap_guard); + if (env->me_lfd == INVALID_HANDLE_VALUE) + return MDBX_SUCCESS; /* readonly database in readonly filesystem */ + + /* transition from S-? (used) to S-E (locked), + * e.g. exclusive lock upper-part */ + if ((env->me_flags & MDBX_EXCLUSIVE) || + flock(env->me_lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER)) + return MDBX_SUCCESS; + + int rc = (int)GetLastError(); + mdbx_srwlock_ReleaseShared(&env->me_remap_guard); + return rc; +} + +MDBX_INTERNAL_FUNC void mdbx_rdt_unlock(MDBX_env *env) { + if (env->me_lfd != INVALID_HANDLE_VALUE) { + /* transition from S-E (locked) to S-? (used), e.g. unlock upper-part */ + if ((env->me_flags & MDBX_EXCLUSIVE) == 0 && + !funlock(env->me_lfd, LCK_UPPER)) + mdbx_panic("%s failed: err %u", __func__, (int)GetLastError()); + } + mdbx_srwlock_ReleaseShared(&env->me_remap_guard); +} + +MDBX_INTERNAL_FUNC int mdbx_lockfile(mdbx_filehandle_t fd, bool wait) { + return flock(fd, + wait ? LCK_EXCLUSIVE | LCK_WAITFOR + : LCK_EXCLUSIVE | LCK_DONTWAIT, + 0, LCK_MAXLEN) + ? MDBX_SUCCESS + : (int)GetLastError(); +} + +static int suspend_and_append(mdbx_handle_array_t **array, + const DWORD ThreadId) { + const unsigned limit = (*array)->limit; + if ((*array)->count == limit) { + void *ptr = mdbx_realloc( + (limit > ARRAY_LENGTH((*array)->handles)) + ? 
*array + : /* don't free initial array on the stack */ NULL, + sizeof(mdbx_handle_array_t) + + sizeof(HANDLE) * (limit * 2 - ARRAY_LENGTH((*array)->handles))); + if (!ptr) + return MDBX_ENOMEM; + if (limit == ARRAY_LENGTH((*array)->handles)) + memcpy(ptr, *array, sizeof(mdbx_handle_array_t)); + *array = (mdbx_handle_array_t *)ptr; + (*array)->limit = limit * 2; + } + + HANDLE hThread = OpenThread(THREAD_SUSPEND_RESUME | THREAD_QUERY_INFORMATION, + FALSE, ThreadId); + if (hThread == NULL) + return (int)GetLastError(); + + if (SuspendThread(hThread) == (DWORD)-1) { + int err = (int)GetLastError(); + DWORD ExitCode; + if (err == /* workaround for Win10 UCRT bug */ ERROR_ACCESS_DENIED || + !GetExitCodeThread(hThread, &ExitCode) || ExitCode != STILL_ACTIVE) + err = MDBX_SUCCESS; + CloseHandle(hThread); + return err; + } + + (*array)->handles[(*array)->count++] = hThread; + return MDBX_SUCCESS; +} + +MDBX_INTERNAL_FUNC int +mdbx_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array) { + mdbx_assert(env, (env->me_flags & MDBX_NOTLS) == 0); + const uintptr_t CurrentTid = GetCurrentThreadId(); + int rc; + if (env->me_lck_mmap.lck) { + /* Scan LCK for threads of the current process */ + const MDBX_reader *const begin = env->me_lck_mmap.lck->mti_readers; + const MDBX_reader *const end = + begin + + atomic_load32(&env->me_lck_mmap.lck->mti_numreaders, mo_AcquireRelease); + const uintptr_t WriteTxnOwner = env->me_txn0 ? env->me_txn0->mt_owner : 0; + for (const MDBX_reader *reader = begin; reader < end; ++reader) { + if (reader->mr_pid.weak != env->me_pid || !reader->mr_tid.weak) { + skip_lck: + continue; + } + if (reader->mr_tid.weak == CurrentTid || + reader->mr_tid.weak == WriteTxnOwner) + goto skip_lck; + + rc = suspend_and_append(array, (mdbx_tid_t)reader->mr_tid.weak); + if (rc != MDBX_SUCCESS) { + bailout_lck: + (void)mdbx_resume_threads_after_remap(*array); + return rc; + } + } + if (WriteTxnOwner && WriteTxnOwner != CurrentTid) { + rc = suspend_and_append(array, (mdbx_tid_t)WriteTxnOwner); + if (rc != MDBX_SUCCESS) + goto bailout_lck; + } + } else { + /* Without LCK (i.e. read-only mode). 
+     * Walk through a snapshot of all running threads */
+    mdbx_assert(env, env->me_flags & (MDBX_EXCLUSIVE | MDBX_RDONLY));
+    const HANDLE hSnapshot = CreateToolhelp32Snapshot(TH32CS_SNAPTHREAD, 0);
+    if (hSnapshot == INVALID_HANDLE_VALUE)
+      return (int)GetLastError();
+
+    THREADENTRY32 entry;
+    entry.dwSize = sizeof(THREADENTRY32);
+
+    if (!Thread32First(hSnapshot, &entry)) {
+      rc = (int)GetLastError();
+    bailout_toolhelp:
+      CloseHandle(hSnapshot);
+      (void)mdbx_resume_threads_after_remap(*array);
+      return rc;
+    }
+
+    do {
+      if (entry.th32OwnerProcessID != env->me_pid ||
+          entry.th32ThreadID == CurrentTid)
+        continue;
+
+      rc = suspend_and_append(array, entry.th32ThreadID);
+      if (rc != MDBX_SUCCESS)
+        goto bailout_toolhelp;
+
+    } while (Thread32Next(hSnapshot, &entry));
+
+    rc = (int)GetLastError();
+    if (rc != ERROR_NO_MORE_FILES)
+      goto bailout_toolhelp;
+    CloseHandle(hSnapshot);
+  }
+
+  return MDBX_SUCCESS;
+}
+
+MDBX_INTERNAL_FUNC int
+mdbx_resume_threads_after_remap(mdbx_handle_array_t *array) {
+  int rc = MDBX_SUCCESS;
+  for (unsigned i = 0; i < array->count; ++i) {
+    const HANDLE hThread = array->handles[i];
+    if (ResumeThread(hThread) == (DWORD)-1) {
+      const int err = (int)GetLastError();
+      DWORD ExitCode;
+      if (err != /* workaround for Win10 UCRT bug */ ERROR_ACCESS_DENIED &&
+          GetExitCodeThread(hThread, &ExitCode) && ExitCode == STILL_ACTIVE)
+        rc = err;
+    }
+    CloseHandle(hThread);
+  }
+  return rc;
+}
+
+/*----------------------------------------------------------------------------*/
+/* global `initial` lock for lockfile initialization,
+ * exclusive/shared locking first cacheline */
+
+/* Brief description of the locking schema/algorithm:
+ *  - Windows does not support upgrading or downgrading for file locking.
+ *  - Therefore upgrading/downgrading is emulated by shared and exclusive
+ *    locking of upper and lower halves.
+ *  - In other words, we have an FSM with 9 possible states,
+ *    i.e. free/shared/exclusive x free/shared/exclusive == 9.
+ *    Only 6 of the FSM states are used, 2 of which are transitive.
+ *
+ * States:
+ *   ?-? = free, i.e. unlocked
+ *   S-? = used, i.e. shared lock
+ *   E-? = exclusive-read, i.e. operational exclusive
+ *   ?-S
+ *   ?-E = middle (transitive state)
+ *   S-S
+ *   S-E = locked (transitive state)
+ *   E-S
+ *   E-E = exclusive-write, i.e. exclusive due to (re)initialization
+ *
+ *  The mdbx_lck_seize() moves the locking-FSM from the initial free/unlocked
+ *  state to the "exclusive write" (and returns MDBX_RESULT_TRUE) if possible,
+ *  or to the "used" (and returns MDBX_RESULT_FALSE).
+ *
+ *  The mdbx_lck_downgrade() moves the locking-FSM from "exclusive write"
+ *  state to the "used" (i.e. shared) state.
+ *
+ *  The mdbx_lck_upgrade() moves the locking-FSM from "used" (i.e. shared)
+ *  state to the "exclusive write" state.
+ */
+
+static void lck_unlock(MDBX_env *env) {
+  int err;
+
+  if (env->me_lfd != INVALID_HANDLE_VALUE) {
+    /* double `unlock` to robustly remove overlapped shared/exclusive locks */
+    while (funlock(env->me_lfd, LCK_LOWER))
+      ;
+    err = (int)GetLastError();
+    assert(err == ERROR_NOT_LOCKED ||
+           (mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION));
+    (void)err;
+    SetLastError(ERROR_SUCCESS);
+
+    while (funlock(env->me_lfd, LCK_UPPER))
+      ;
+    err = (int)GetLastError();
+    assert(err == ERROR_NOT_LOCKED ||
+           (mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION));
+    (void)err;
+    SetLastError(ERROR_SUCCESS);
+  }
+
+  if (env->me_lazy_fd != INVALID_HANDLE_VALUE) {
+    /* explicitly unlock to avoid latency for other processes (windows kernel
+     * releases such locks via deferred queues) */
+    while (funlock(env->me_lazy_fd, LCK_BODY))
+      ;
+    err = (int)GetLastError();
+    assert(err == ERROR_NOT_LOCKED ||
+           (mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION));
+    (void)err;
+    SetLastError(ERROR_SUCCESS);
+
+    while (funlock(env->me_lazy_fd, LCK_WHOLE))
+      ;
+    err = (int)GetLastError();
+    assert(err == ERROR_NOT_LOCKED ||
+           (mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION));
+    (void)err;
+    SetLastError(ERROR_SUCCESS);
+  }
+}
+
+/* Seize state as 'exclusive-write' (E-E and returns MDBX_RESULT_TRUE)
+ * or as 'used' (S-? and returns MDBX_RESULT_FALSE).
+ * Otherwise returns an error. */
+static int internal_seize_lck(HANDLE lfd) {
+  int rc;
+  assert(lfd != INVALID_HANDLE_VALUE);
+
+  /* 1) now on ?-? (free), get ?-E (middle) */
+  mdbx_jitter4testing(false);
+  if (!flock(lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER)) {
+    rc = (int)GetLastError() /* 2) something went wrong, give up */;
+    mdbx_error("%s, err %u", "?-?(free) >> ?-E(middle)", rc);
+    return rc;
+  }
+
+  /* 3) now on ?-E (middle), try E-E (exclusive-write) */
+  mdbx_jitter4testing(false);
+  if (flock(lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER))
+    return MDBX_RESULT_TRUE /* 4) got E-E (exclusive-write), done */;
+
+  /* 5) still on ?-E (middle) */
+  rc = (int)GetLastError();
+  mdbx_jitter4testing(false);
+  if (rc != ERROR_SHARING_VIOLATION && rc != ERROR_LOCK_VIOLATION) {
+    /* 6) something went wrong, give up */
+    if (!funlock(lfd, LCK_UPPER))
+      mdbx_panic("%s(%s) failed: err %u", __func__, "?-E(middle) >> ?-?(free)",
+                 (int)GetLastError());
+    return rc;
+  }
+
+  /* 7) still on ?-E (middle), try S-E (locked) */
+  mdbx_jitter4testing(false);
+  rc = flock(lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER) ? MDBX_RESULT_FALSE
+                                                        : (int)GetLastError();
+
+  mdbx_jitter4testing(false);
+  if (rc != MDBX_RESULT_FALSE)
+    mdbx_error("%s, err %u", "?-E(middle) >> S-E(locked)", rc);
+
+  /* 8) now on S-E (locked) or still on ?-E (middle),
+   * transition to S-? (used) or ?-? (free) */
+  if (!funlock(lfd, LCK_UPPER))
+    mdbx_panic("%s(%s) failed: err %u", __func__,
+               "X-E(locked/middle) >> X-?(used/free)", (int)GetLastError());
+
+  /* 9) now on S-? (used, DONE) or ?-? (free, FAILURE) */
+  return rc;
+}
+
+MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env) {
+  int rc;
+
+  assert(env->me_lazy_fd != INVALID_HANDLE_VALUE);
+  if (env->me_flags & MDBX_EXCLUSIVE)
+    return MDBX_RESULT_TRUE /* nope, since files must be opened
+                               non-shareable */
+        ;
+
+  if (env->me_lfd == INVALID_HANDLE_VALUE) {
+    /* LY: without-lck mode (e.g. on read-only filesystem) */
+    mdbx_jitter4testing(false);
+    if (!flock(env->me_lazy_fd, LCK_SHARED | LCK_DONTWAIT, LCK_WHOLE)) {
+      rc = (int)GetLastError();
+      mdbx_error("%s, err %u", "without-lck", rc);
+      return rc;
+    }
+    return MDBX_RESULT_FALSE;
+  }
+
+  rc = internal_seize_lck(env->me_lfd);
+  mdbx_jitter4testing(false);
+  if (rc == MDBX_RESULT_TRUE && (env->me_flags & MDBX_RDONLY) == 0) {
+    /* Check that no other process operates in without-lck mode.
+     * This is done by exclusively locking the body-part of the db.
+     * Should be noted:
+     *  - we need an exclusive lock to do so;
+     *  - we can't lock meta-pages, otherwise another process could get an
+     *    error while opening the db in a valid (non-conflict) mode. */
+    if (!flock(env->me_lazy_fd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_BODY)) {
+      rc = (int)GetLastError();
+      mdbx_error("%s, err %u", "lock-against-without-lck", rc);
+      mdbx_jitter4testing(false);
+      lck_unlock(env);
+    } else {
+      mdbx_jitter4testing(false);
+      if (!funlock(env->me_lazy_fd, LCK_BODY))
+        mdbx_panic("%s(%s) failed: err %u", __func__,
+                   "unlock-against-without-lck", (int)GetLastError());
+    }
+  }
+
+  return rc;
+}
+
+MDBX_INTERNAL_FUNC int mdbx_lck_downgrade(MDBX_env *env) {
+  /* Transition from the exclusive-write state (E-E) to used (S-?) */
+  assert(env->me_lazy_fd != INVALID_HANDLE_VALUE);
+  assert(env->me_lfd != INVALID_HANDLE_VALUE);
+
+  if (env->me_flags & MDBX_EXCLUSIVE)
+    return MDBX_SUCCESS /* nope, since files must be opened non-shareable */
+        ;
+  /* 1) now at E-E (exclusive-write), transition to ?_E (middle) */
+  if (!funlock(env->me_lfd, LCK_LOWER))
+    mdbx_panic("%s(%s) failed: err %u", __func__,
+               "E-E(exclusive-write) >> ?-E(middle)", (int)GetLastError());
+
+  /* 2) now at ?-E (middle), transition to S-E (locked) */
+  if (!flock(env->me_lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER)) {
+    int rc = (int)GetLastError() /* 3) something went wrong, give up */;
+    mdbx_error("%s, err %u", "?-E(middle) >> S-E(locked)", rc);
+    return rc;
+  }
+
+  /* 4) got S-E (locked), continue transition to S-? (used) */
+  if (!funlock(env->me_lfd, LCK_UPPER))
+    mdbx_panic("%s(%s) failed: err %u", __func__, "S-E(locked) >> S-?(used)",
+               (int)GetLastError());
+
+  return MDBX_SUCCESS /* 5) now at S-? (used), done */;
+}
+
+MDBX_INTERNAL_FUNC int mdbx_lck_upgrade(MDBX_env *env) {
+  /* Transition from the used state (S-?) to exclusive-write (E-E) */
+  assert(env->me_lfd != INVALID_HANDLE_VALUE);
+
+  if (env->me_flags & MDBX_EXCLUSIVE)
+    return MDBX_SUCCESS /* nope, since files must be opened non-shareable */
+        ;
+
+  int rc;
+  /* 1) now on S-?
(used), try S-E (locked) */ + mdbx_jitter4testing(false); + if (!flock(env->me_lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_UPPER)) { + rc = (int)GetLastError() /* 2) something went wrong, give up */; + mdbx_verbose("%s, err %u", "S-?(used) >> S-E(locked)", rc); + return rc; + } + + /* 3) now on S-E (locked), transition to ?-E (middle) */ + if (!funlock(env->me_lfd, LCK_LOWER)) + mdbx_panic("%s(%s) failed: err %u", __func__, "S-E(locked) >> ?-E(middle)", + (int)GetLastError()); + + /* 4) now on ?-E (middle), try E-E (exclusive-write) */ + mdbx_jitter4testing(false); + if (!flock(env->me_lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER)) { + rc = (int)GetLastError() /* 5) something went wrong, give up */; + mdbx_verbose("%s, err %u", "?-E(middle) >> E-E(exclusive-write)", rc); + return rc; + } + + return MDBX_SUCCESS /* 6) now at E-E (exclusive-write), done */; +} + +MDBX_INTERNAL_FUNC int mdbx_lck_init(MDBX_env *env, + MDBX_env *inprocess_neighbor, + int global_uniqueness_flag) { + (void)env; + (void)inprocess_neighbor; + (void)global_uniqueness_flag; + return MDBX_SUCCESS; +} + +MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env, + MDBX_env *inprocess_neighbor) { + /* LY: should unmap before releasing the locks to avoid race condition and + * STATUS_USER_MAPPED_FILE/ERROR_USER_MAPPED_FILE */ + if (env->me_map) + mdbx_munmap(&env->me_dxb_mmap); + if (env->me_lck_mmap.lck) { + const bool synced = env->me_lck_mmap.lck->mti_unsynced_pages.weak == 0; + mdbx_munmap(&env->me_lck_mmap); + if (synced && !inprocess_neighbor && env->me_lfd != INVALID_HANDLE_VALUE && + mdbx_lck_upgrade(env) == MDBX_SUCCESS) + /* this will fail if LCK is used/mmapped by other process(es) */ + mdbx_ftruncate(env->me_lfd, 0); + } + lck_unlock(env); + return MDBX_SUCCESS; +} + +/*----------------------------------------------------------------------------*/ +/* reader checking (by pid) */ + +MDBX_INTERNAL_FUNC int mdbx_rpid_set(MDBX_env *env) { + (void)env; + return MDBX_SUCCESS; +} + +MDBX_INTERNAL_FUNC int mdbx_rpid_clear(MDBX_env *env) { + (void)env; + return MDBX_SUCCESS; +} + +/* Checks reader by pid. + * + * Returns: + * MDBX_RESULT_TRUE, if pid is live (unable to acquire lock) + * MDBX_RESULT_FALSE, if pid is dead (lock acquired) + * or otherwise the errcode. */ +MDBX_INTERNAL_FUNC int mdbx_rpid_check(MDBX_env *env, uint32_t pid) { + (void)env; + HANDLE hProcess = OpenProcess(SYNCHRONIZE, FALSE, pid); + int rc; + if (likely(hProcess)) { + rc = WaitForSingleObject(hProcess, 0); + if (unlikely(rc == (int)WAIT_FAILED)) + rc = (int)GetLastError(); + CloseHandle(hProcess); + } else { + rc = (int)GetLastError(); + } + + switch (rc) { + case ERROR_INVALID_PARAMETER: + /* pid seems invalid */ + return MDBX_RESULT_FALSE; + case WAIT_OBJECT_0: + /* process just exited */ + return MDBX_RESULT_FALSE; + case ERROR_ACCESS_DENIED: + /* The ERROR_ACCESS_DENIED would be returned for CSRSS-processes, etc. 
+ * assume pid exists */ + return MDBX_RESULT_TRUE; + case WAIT_TIMEOUT: + /* pid running */ + return MDBX_RESULT_TRUE; + default: + /* failure */ + return rc; + } +} + +//---------------------------------------------------------------------------- +// Stub for slim read-write lock +// Copyright (C) 1995-2002 Brad Wilson + +static void WINAPI stub_srwlock_Init(MDBX_srwlock *srwl) { + srwl->readerCount = srwl->writerCount = 0; +} + +static void WINAPI stub_srwlock_AcquireShared(MDBX_srwlock *srwl) { + while (true) { + assert(srwl->writerCount >= 0 && srwl->readerCount >= 0); + + // If there's a writer already, spin without unnecessarily + // interlocking the CPUs + if (srwl->writerCount != 0) { + YieldProcessor(); + continue; + } + + // Add to the readers list + _InterlockedIncrement(&srwl->readerCount); + + // Check for writers again (we may have been preempted). If + // there are no writers writing or waiting, then we're done. + if (srwl->writerCount == 0) + break; + + // Remove from the readers list, spin, try again + _InterlockedDecrement(&srwl->readerCount); + YieldProcessor(); + } +} + +static void WINAPI stub_srwlock_ReleaseShared(MDBX_srwlock *srwl) { + assert(srwl->readerCount > 0); + _InterlockedDecrement(&srwl->readerCount); +} + +static void WINAPI stub_srwlock_AcquireExclusive(MDBX_srwlock *srwl) { + while (true) { + assert(srwl->writerCount >= 0 && srwl->readerCount >= 0); + + // If there's a writer already, spin without unnecessarily + // interlocking the CPUs + if (srwl->writerCount != 0) { + YieldProcessor(); + continue; + } + + // See if we can become the writer (expensive, because it inter- + // locks the CPUs, so writing should be an infrequent process) + if (_InterlockedExchange(&srwl->writerCount, 1) == 0) + break; + } + + // Now we're the writer, but there may be outstanding readers. + // Spin until there aren't any more; new readers will wait now + // that we're the writer. + while (srwl->readerCount != 0) { + assert(srwl->writerCount >= 0 && srwl->readerCount >= 0); + YieldProcessor(); + } +} + +static void WINAPI stub_srwlock_ReleaseExclusive(MDBX_srwlock *srwl) { + assert(srwl->writerCount == 1 && srwl->readerCount >= 0); + srwl->writerCount = 0; +} + +static uint64_t WINAPI stub_GetTickCount64(void) { + LARGE_INTEGER Counter, Frequency; + return (QueryPerformanceFrequency(&Frequency) && + QueryPerformanceCounter(&Counter)) + ? 
Counter.QuadPart * 1000ul / Frequency.QuadPart
+             : 0;
+}
+
+/*----------------------------------------------------------------------------*/
+
+#ifndef xMDBX_ALLOY
+MDBX_srwlock_function mdbx_srwlock_Init, mdbx_srwlock_AcquireShared,
+    mdbx_srwlock_ReleaseShared, mdbx_srwlock_AcquireExclusive,
+    mdbx_srwlock_ReleaseExclusive;
+
+MDBX_NtExtendSection mdbx_NtExtendSection;
+MDBX_GetFileInformationByHandleEx mdbx_GetFileInformationByHandleEx;
+MDBX_GetVolumeInformationByHandleW mdbx_GetVolumeInformationByHandleW;
+MDBX_GetFinalPathNameByHandleW mdbx_GetFinalPathNameByHandleW;
+MDBX_SetFileInformationByHandle mdbx_SetFileInformationByHandle;
+MDBX_NtFsControlFile mdbx_NtFsControlFile;
+MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory;
+MDBX_GetTickCount64 mdbx_GetTickCount64;
+MDBX_RegGetValueA mdbx_RegGetValueA;
+#endif /* xMDBX_ALLOY */
+
+static void mdbx_winnt_import(void) {
+  const HINSTANCE hNtdll = GetModuleHandleA("ntdll.dll");
+
+#define GET_PROC_ADDR(dll, ENTRY)                                             \
+  mdbx_##ENTRY = (MDBX_##ENTRY)GetProcAddress(dll, #ENTRY)
+
+  if (GetProcAddress(hNtdll, "wine_get_version")) {
+    assert(mdbx_RunningUnderWine());
+  } else {
+    GET_PROC_ADDR(hNtdll, NtFsControlFile);
+    GET_PROC_ADDR(hNtdll, NtExtendSection);
+    assert(!mdbx_RunningUnderWine());
+  }
+
+  const HINSTANCE hKernel32dll = GetModuleHandleA("kernel32.dll");
+  GET_PROC_ADDR(hKernel32dll, GetFileInformationByHandleEx);
+  GET_PROC_ADDR(hKernel32dll, GetTickCount64);
+  if (!mdbx_GetTickCount64)
+    mdbx_GetTickCount64 = stub_GetTickCount64;
+  if (!mdbx_RunningUnderWine()) {
+    GET_PROC_ADDR(hKernel32dll, SetFileInformationByHandle);
+    GET_PROC_ADDR(hKernel32dll, GetVolumeInformationByHandleW);
+    GET_PROC_ADDR(hKernel32dll, GetFinalPathNameByHandleW);
+    GET_PROC_ADDR(hKernel32dll, PrefetchVirtualMemory);
+  }
+
+  const HINSTANCE hAdvapi32dll = GetModuleHandleA("advapi32.dll");
+  GET_PROC_ADDR(hAdvapi32dll, RegGetValueA);
+#undef GET_PROC_ADDR
+
+  const MDBX_srwlock_function init =
+      (MDBX_srwlock_function)GetProcAddress(hKernel32dll, "InitializeSRWLock");
+  if (init != NULL) {
+    mdbx_srwlock_Init = init;
+    mdbx_srwlock_AcquireShared = (MDBX_srwlock_function)GetProcAddress(
+        hKernel32dll, "AcquireSRWLockShared");
+    mdbx_srwlock_ReleaseShared = (MDBX_srwlock_function)GetProcAddress(
+        hKernel32dll, "ReleaseSRWLockShared");
+    mdbx_srwlock_AcquireExclusive = (MDBX_srwlock_function)GetProcAddress(
+        hKernel32dll, "AcquireSRWLockExclusive");
+    mdbx_srwlock_ReleaseExclusive = (MDBX_srwlock_function)GetProcAddress(
+        hKernel32dll, "ReleaseSRWLockExclusive");
+  } else {
+    mdbx_srwlock_Init = stub_srwlock_Init;
+    mdbx_srwlock_AcquireShared = stub_srwlock_AcquireShared;
+    mdbx_srwlock_ReleaseShared = stub_srwlock_ReleaseShared;
+    mdbx_srwlock_AcquireExclusive = stub_srwlock_AcquireExclusive;
+    mdbx_srwlock_ReleaseExclusive = stub_srwlock_ReleaseExclusive;
+  }
+}
+
+#endif /* Windows LCK-implementation */
+/*
+ * Copyright 2015-2021 Leonid Yuriev <leo@yuriev.ru>
+ * and other libmdbx authors: please see AUTHORS file.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted only as authorized by the OpenLDAP
+ * Public License.
+ *
+ * A copy of this license is available in the file LICENSE in the
+ * top-level directory of the distribution or, alternatively, at
+ * <http://www.OpenLDAP.org/license.html>.
+ */
+
+#if !(defined(_WIN32) || defined(_WIN64)) /* !Windows LCK-implementation */
+
+#include <sys/sem.h>
+
+/*----------------------------------------------------------------------------*/
+/* global constructor/destructor */
+
+#if defined(__linux__) || defined(__gnu_linux__)
+
+#include <sys/utsname.h>
+
+#ifndef xMDBX_ALLOY
+uint32_t mdbx_linux_kernel_version;
+bool mdbx_RunningOnWSL1;
+#endif /* xMDBX_ALLOY */
+
+static __cold uint8_t probe_for_WSL(const char *tag) {
+  const char *const WSL = strstr(tag, "WSL");
+  if (WSL && WSL[3] >= '2' && WSL[3] <= '9')
+    return WSL[3] - '0';
+  const char *const wsl = strstr(tag, "wsl");
+  if (wsl && wsl[3] >= '2' && wsl[3] <= '9')
+    return wsl[3] - '0';
+  if (WSL || wsl || strcasestr(tag, "Microsoft"))
+    /* Expecting no new kernel within WSL1, or it will be explicitly
+     * marked by an appropriate WSL-version hint. */
+    return (mdbx_linux_kernel_version < /* 4.19.x */ 0x04130000) ? 1 : 2;
+  return 0;
+}
+
+#endif /* Linux */
+
+static __cold __attribute__((__constructor__)) void
+mdbx_global_constructor(void) {
+#if defined(__linux__) || defined(__gnu_linux__)
+  struct utsname buffer;
+  if (uname(&buffer) == 0) {
+    int i = 0;
+    char *p = buffer.release;
+    while (*p && i < 4) {
+      if (*p >= '0' && *p <= '9') {
+        long number = strtol(p, &p, 10);
+        if (number > 0) {
+          if (number > 255)
+            number = 255;
+          mdbx_linux_kernel_version += number << (24 - i * 8);
+        }
+        ++i;
+      } else {
+        ++p;
+      }
+    }
+    /* "Official" way of detecting WSL1 but not WSL2
+     * https://github.com/Microsoft/WSL/issues/423#issuecomment-221627364
+     *
+     * WARNING: False negative detection of WSL1 will result in DATA LOSS!
+     * So, the REQUIREMENTS for this code:
+     *  1. MUST detect WSL1 without false-negatives.
+     *  2. DESIRABLE to detect WSL2, but without the risk of violating the
+     *     first requirement. */
+    mdbx_RunningOnWSL1 = probe_for_WSL(buffer.version) == 1 ||
+                         probe_for_WSL(buffer.sysname) == 1 ||
+                         probe_for_WSL(buffer.release) == 1;
+  }
+#endif /* Linux */
+
+  mdbx_rthc_global_init();
+}
+
+static __cold __attribute__((__destructor__)) void
+mdbx_global_destructor(void) {
+  mdbx_rthc_global_dtor();
+}
+
+/*----------------------------------------------------------------------------*/
+/* lck */
+
+/* Description of the locking implementation for POSIX & Linux:
+ *
+ * The lck-file is mapped into memory; it hosts the readers table and the
+ * shared POSIX mutexes (futexes). By means of these mutexes
+ * (see struct MDBX_lockinfo) the following is implemented:
+ *  - Locking the readers table for registration,
+ *    i.e. the mdbx_rdt_lock() and mdbx_rdt_unlock() functions.
+ *  - Locking the DB for write transactions,
+ *    i.e. the mdbx_txn_lock() and mdbx_txn_unlock() functions.
+ *
+ * The rest of the functionality is implemented separately via file locks:
+ *  - The initial seizing of the DB in exclusive/shared mode with the
+ *    subsequent transition to operational mode, i.e. the mdbx_lck_seize()
+ *    and mdbx_lck_downgrade() functions.
+ *  - Checking for the presence of reader processes, i.e. the mdbx_rpid_set(),
+ *    mdbx_rpid_clear() and mdbx_rpid_check() functions.
+ *
+ * fcntl(F_SETLK) is used for file locking because:
+ *  - lockf() operates only with exclusive locks and requires the file
+ *    to be opened in RW-mode.
+ *  - flock() does not guarantee atomicity when changing locks and
+ *    operates only on the whole file.
+ *  - To control reader processes, one-byte range locks of the lck-file
+ *    are used via fcntl(F_SETLK), with the pid of a reader process
+ *    used as the position.
+ *  - For the initial shared/exclusive seizing, the main DB file is locked
+ *    and, on success, the lck-file as well.
+ *
+ * ----------------------------------------------------------------------------
+ * LOCKS HELD DEPENDING ON THE MODE AND STATE
+ *
+ * Exclusive mode without the lck-file:
+ *   = the whole dxb-file is locked via F_RDLCK or F_WRLCK,
+ *     depending on MDBX_RDONLY.
+ *
+ * Non-operational mode during (re)initialization and destruction of the
+ * lck-file:
+ *   = F_WRLCK lock of the first byte of the lck-file; other processes wait
+ *     for its release by acquiring F_RDLCK via F_SETLKW.
+ *   - locks of the dxb-file may change until the exclusive lock of the
+ *     lck-file is released:
+ *     + for the NON-exclusive mode, the pid-byte of the dxb-file is locked
+ *       via F_RDLCK or F_WRLCK, depending on MDBX_RDONLY.
+ *     + for the EXCLUSIVE mode, the whole dxb-file is locked
+ *       via F_RDLCK or F_WRLCK, depending on MDBX_RDONLY.
+ *
+ * OPERATIONAL mode with the lck-file:
+ *   = F_RDLCK lock of the first byte of the lck-file; other processes cannot
+ *     acquire F_WRLCK and thus see that the DB is in use.
+ *   + F_WRLCK lock of the pid-byte in the lck-file after the first read
+ *     transaction.
+ *   + for the NON-exclusive mode, the pid-byte of the dxb-file is locked
+ *     via F_RDLCK or F_WRLCK, depending on MDBX_RDONLY.
+ *   + for the EXCLUSIVE mode, the whole dxb-file is locked
+ *     via F_RDLCK or F_WRLCK, depending on MDBX_RDONLY.
+ */
+
+#if MDBX_USE_OFDLOCKS
+static int op_setlk, op_setlkw, op_getlk;
+static void __cold choice_fcntl() {
+  assert(!op_setlk && !op_setlkw && !op_getlk);
+  if ((mdbx_runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN) == 0
+#if defined(__linux__) || defined(__gnu_linux__)
+      && mdbx_linux_kernel_version >
+             0x030f0000 /* OFD locks are available since 3.15, but are engaged
+                           here only for 3.16 and later kernels (i.e. LTS)
+                           for reliability reasons */
+#endif /* linux */
+  ) {
+    op_setlk = F_OFD_SETLK;
+    op_setlkw = F_OFD_SETLKW;
+    op_getlk = F_OFD_GETLK;
+    return;
+  }
+  op_setlk = F_SETLK;
+  op_setlkw = F_SETLKW;
+  op_getlk = F_GETLK;
+}
+#else
+#define op_setlk F_SETLK
+#define op_setlkw F_SETLKW
+#define op_getlk F_GETLK
+#endif /* MDBX_USE_OFDLOCKS */
+
+#ifndef OFF_T_MAX
+#define OFF_T_MAX                                                             \
+  ((sizeof(off_t) > 4 ? INT64_MAX : INT32_MAX) & ~(size_t)0xffff)
+#endif
+
+static int lck_op(mdbx_filehandle_t fd, int cmd, int lck, off_t offset,
+                  off_t len) {
+  mdbx_jitter4testing(true);
+  for (;;) {
+    struct flock lock_op;
+    memset(&lock_op, 0, sizeof(lock_op));
+    lock_op.l_type = lck;
+    lock_op.l_whence = SEEK_SET;
+    lock_op.l_start = offset;
+    lock_op.l_len = len;
+    int rc = fcntl(fd, cmd, &lock_op);
+    mdbx_jitter4testing(true);
+    if (rc != -1) {
+      if (cmd == op_getlk) {
+        /* Checks reader by pid. Returns:
+         *   MDBX_RESULT_TRUE  - if pid is live (reader holds a lock).
+         *   MDBX_RESULT_FALSE - if pid is dead (a lock could be placed). */
+        return (lock_op.l_type == F_UNLCK) ?
MDBX_RESULT_FALSE + : MDBX_RESULT_TRUE; + } + return MDBX_SUCCESS; + } + rc = errno; +#if MDBX_USE_OFDLOCKS + if (rc == EINVAL && + (cmd == F_OFD_SETLK || cmd == F_OFD_SETLKW || cmd == F_OFD_GETLK)) { + /* fallback to non-OFD locks */ + if (cmd == F_OFD_SETLK) + cmd = F_SETLK; + else if (cmd == F_OFD_SETLKW) + cmd = F_SETLKW; + else + cmd = F_GETLK; + op_setlk = F_SETLK; + op_setlkw = F_SETLKW; + op_getlk = F_GETLK; + continue; + } +#endif /* MDBX_USE_OFDLOCKS */ + if (rc != EINTR || cmd == op_setlkw) { + mdbx_assert(nullptr, MDBX_IS_ERROR(rc)); + return rc; + } + } +} + +MDBX_INTERNAL_FUNC int mdbx_lockfile(mdbx_filehandle_t fd, bool wait) { +#if MDBX_USE_OFDLOCKS + if (unlikely(op_setlk == 0)) + choice_fcntl(); +#endif /* MDBX_USE_OFDLOCKS */ + return lck_op(fd, wait ? op_setlkw : op_setlk, F_WRLCK, 0, OFF_T_MAX); +} + +MDBX_INTERNAL_FUNC int mdbx_rpid_set(MDBX_env *env) { + assert(env->me_lfd != INVALID_HANDLE_VALUE); + assert(env->me_pid > 0); + if (unlikely(mdbx_getpid() != env->me_pid)) + return MDBX_PANIC; + return lck_op(env->me_lfd, op_setlk, F_WRLCK, env->me_pid, 1); +} + +MDBX_INTERNAL_FUNC int mdbx_rpid_clear(MDBX_env *env) { + assert(env->me_lfd != INVALID_HANDLE_VALUE); + assert(env->me_pid > 0); + return lck_op(env->me_lfd, op_setlk, F_UNLCK, env->me_pid, 1); +} + +MDBX_INTERNAL_FUNC int mdbx_rpid_check(MDBX_env *env, uint32_t pid) { + assert(env->me_lfd != INVALID_HANDLE_VALUE); + assert(pid > 0); + return lck_op(env->me_lfd, op_getlk, F_WRLCK, pid, 1); +} + +/*---------------------------------------------------------------------------*/ + +#if MDBX_LOCKING > MDBX_LOCKING_SYSV +MDBX_INTERNAL_FUNC int mdbx_ipclock_stub(mdbx_ipclock_t *ipc) { +#if MDBX_LOCKING == MDBX_LOCKING_POSIX1988 + return sem_init(ipc, false, 1) ? errno : 0; +#elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ + MDBX_LOCKING == MDBX_LOCKING_POSIX2008 + return pthread_mutex_init(ipc, nullptr); +#else +#error "FIXME" +#endif +} + +MDBX_INTERNAL_FUNC int mdbx_ipclock_destroy(mdbx_ipclock_t *ipc) { +#if MDBX_LOCKING == MDBX_LOCKING_POSIX1988 + return sem_destroy(ipc) ? errno : 0; +#elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ + MDBX_LOCKING == MDBX_LOCKING_POSIX2008 + return pthread_mutex_destroy(ipc); +#else +#error "FIXME" +#endif +} +#endif /* MDBX_LOCKING > MDBX_LOCKING_SYSV */ + +static int check_fstat(MDBX_env *env) { + struct stat st; + + int rc = MDBX_SUCCESS; + if (fstat(env->me_lazy_fd, &st)) { + rc = errno; + mdbx_error("fstat(%s), err %d", "DXB", rc); + return rc; + } + + if (!S_ISREG(st.st_mode) || st.st_nlink < 1) { +#ifdef EBADFD + rc = EBADFD; +#else + rc = EPERM; +#endif + mdbx_error("%s %s, err %d", "DXB", + (st.st_nlink < 1) ? "file was removed" : "not a regular file", + rc); + return rc; + } + + if (st.st_size < (off_t)(MDBX_MIN_PAGESIZE * NUM_METAS)) { + mdbx_verbose("dxb-file is too short (%u), exclusive-lock needed", + (unsigned)st.st_size); + rc = MDBX_RESULT_TRUE; + } + + //---------------------------------------------------------------------------- + + if (fstat(env->me_lfd, &st)) { + rc = errno; + mdbx_error("fstat(%s), err %d", "LCK", rc); + return rc; + } + + if (!S_ISREG(st.st_mode) || st.st_nlink < 1) { +#ifdef EBADFD + rc = EBADFD; +#else + rc = EPERM; +#endif + mdbx_error("%s %s, err %d", "LCK", + (st.st_nlink < 1) ? "file was removed" : "not a regular file", + rc); + return rc; + } + + /* Checking file size for detect the situation when we got the shared lock + * immediately after mdbx_lck_destroy(). 
*/
+  if (st.st_size < (off_t)(sizeof(MDBX_lockinfo) + sizeof(MDBX_reader))) {
+    mdbx_verbose("lck-file is too short (%u), exclusive-lock needed",
+                 (unsigned)st.st_size);
+    rc = MDBX_RESULT_TRUE;
+  }
+
+  return rc;
+}
+
+MDBX_INTERNAL_FUNC int __cold mdbx_lck_seize(MDBX_env *env) {
+  assert(env->me_lazy_fd != INVALID_HANDLE_VALUE);
+  if (unlikely(mdbx_getpid() != env->me_pid))
+    return MDBX_PANIC;
+#if MDBX_USE_OFDLOCKS
+  if (unlikely(op_setlk == 0))
+    choice_fcntl();
+#endif /* MDBX_USE_OFDLOCKS */
+
+  int rc = MDBX_SUCCESS;
+#if defined(__linux__) || defined(__gnu_linux__)
+  if (unlikely(mdbx_RunningOnWSL1)) {
+    rc = ENOLCK /* No record locks available */;
+    mdbx_error("%s, err %u",
+               "WSL1 (Windows Subsystem for Linux) is mad and trouble-full, "
+               "injecting failure to avoid data loss",
+               rc);
+    return rc;
+  }
+#endif /* Linux */
+
+  if (env->me_lfd == INVALID_HANDLE_VALUE) {
+    /* LY: without-lck mode (e.g. exclusive or on read-only filesystem) */
+    rc =
+        lck_op(env->me_lazy_fd, op_setlk,
+               (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, OFF_T_MAX);
+    if (rc != MDBX_SUCCESS) {
+      mdbx_error("%s, err %u", "without-lck", rc);
+      mdbx_assert(env, MDBX_IS_ERROR(rc));
+      return rc;
+    }
+    return MDBX_RESULT_TRUE /* Done: return with exclusive locking. */;
+  }
+#if defined(_POSIX_PRIORITY_SCHEDULING) && _POSIX_PRIORITY_SCHEDULING > 0
+  sched_yield();
+#endif
+
+retry:
+  if (rc == MDBX_RESULT_TRUE) {
+    rc = lck_op(env->me_lfd, op_setlk, F_UNLCK, 0, 1);
+    if (rc != MDBX_SUCCESS) {
+      mdbx_error("%s, err %u", "unlock-before-retry", rc);
+      mdbx_assert(env, MDBX_IS_ERROR(rc));
+      return rc;
+    }
+  }
+
+  /* Firstly try to get exclusive locking. */
+  rc = lck_op(env->me_lfd, op_setlk, F_WRLCK, 0, 1);
+  if (rc == MDBX_SUCCESS) {
+    rc = check_fstat(env);
+    if (MDBX_IS_ERROR(rc))
+      return rc;
+
+  continue_dxb_exclusive:
+    rc =
+        lck_op(env->me_lazy_fd, op_setlk,
+               (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, OFF_T_MAX);
+    if (rc == MDBX_SUCCESS)
+      return MDBX_RESULT_TRUE /* Done: return with exclusive locking. */;
+
+    int err = check_fstat(env);
+    if (MDBX_IS_ERROR(err))
+      return err;
+
+    /* the cause may be a collision with POSIX's file-lock recovery. */
+    if (!(rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK ||
+          rc == EDEADLK)) {
+      mdbx_error("%s, err %u", "dxb-exclusive", rc);
+      mdbx_assert(env, MDBX_IS_ERROR(rc));
+      return rc;
+    }
+
+    /* Fallback to lck-shared */
+  } else if (!(rc == EAGAIN || rc == EACCES || rc == EBUSY ||
+               rc == EWOULDBLOCK || rc == EDEADLK)) {
+    mdbx_error("%s, err %u", "try-exclusive", rc);
+    mdbx_assert(env, MDBX_IS_ERROR(rc));
+    return rc;
+  }
+
+  /* Here one of two cases is possible:
+   *  - mdbx_lck_destroy() in another process was holding the lock
+   *    during destruction;
+   *  - or mdbx_lck_seize() in another process got the exclusive lock
+   *    and is doing initialization.
+   * To distinguish these cases, the size of the lck-file will be used
+   * later. */
+
+  /* Wait for lck-shared now. */
+  /* We may wait here through transient states, for instance until another
+   * competing process calls lck_downgrade(). */
+  rc = lck_op(env->me_lfd, op_setlkw, F_RDLCK, 0, 1);
+  if (rc != MDBX_SUCCESS) {
+    mdbx_error("%s, err %u", "try-shared", rc);
+    mdbx_assert(env, MDBX_IS_ERROR(rc));
+    return rc;
+  }
+
+  rc = check_fstat(env);
+  if (rc == MDBX_RESULT_TRUE)
+    goto retry;
+  if (rc != MDBX_SUCCESS) {
+    mdbx_error("%s, err %u", "lck_fstat", rc);
+    return rc;
+  }
+
+  /* got shared, retry exclusive */
+  rc = lck_op(env->me_lfd, op_setlk, F_WRLCK, 0, 1);
+  if (rc == MDBX_SUCCESS)
+    goto continue_dxb_exclusive;
+
+  if (!(rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK ||
+        rc == EDEADLK)) {
+    mdbx_error("%s, err %u", "try-exclusive", rc);
+    mdbx_assert(env, MDBX_IS_ERROR(rc));
+    return rc;
+  }
+
+  /* Lock against another process operating in without-lck or exclusive mode. */
+  rc =
+      lck_op(env->me_lazy_fd, op_setlk,
+             (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, env->me_pid, 1);
+  if (rc != MDBX_SUCCESS) {
+    mdbx_error("%s, err %u", "lock-against-without-lck", rc);
+    mdbx_assert(env, MDBX_IS_ERROR(rc));
+    return rc;
+  }
+
+  /* Done: return with shared locking. */
+  return MDBX_RESULT_FALSE;
+}
+
+MDBX_INTERNAL_FUNC int mdbx_lck_downgrade(MDBX_env *env) {
+  assert(env->me_lfd != INVALID_HANDLE_VALUE);
+  if (unlikely(mdbx_getpid() != env->me_pid))
+    return MDBX_PANIC;
+
+  int rc = MDBX_SUCCESS;
+  if ((env->me_flags & MDBX_EXCLUSIVE) == 0) {
+    rc = lck_op(env->me_lazy_fd, op_setlk, F_UNLCK, 0, env->me_pid);
+    if (rc == MDBX_SUCCESS)
+      rc = lck_op(env->me_lazy_fd, op_setlk, F_UNLCK, env->me_pid + 1,
+                  OFF_T_MAX - env->me_pid - 1);
+  }
+  if (rc == MDBX_SUCCESS)
+    rc = lck_op(env->me_lfd, op_setlk, F_RDLCK, 0, 1);
+  if (unlikely(rc != 0)) {
+    mdbx_error("%s, err %u", "lck", rc);
+    assert(MDBX_IS_ERROR(rc));
+  }
+  return rc;
+}
+
+MDBX_INTERNAL_FUNC int __cold mdbx_lck_destroy(MDBX_env *env,
+                                               MDBX_env *inprocess_neighbor) {
+  if (unlikely(mdbx_getpid() != env->me_pid))
+    return MDBX_PANIC;
+
+  int rc = MDBX_SUCCESS;
+  struct stat lck_info;
+  MDBX_lockinfo *lck = env->me_lck_mmap.lck;
+  if (env->me_lfd != INVALID_HANDLE_VALUE && !inprocess_neighbor && lck &&
+      /* try to get exclusive access */
+      lck_op(env->me_lfd, op_setlk, F_WRLCK, 0, OFF_T_MAX) == 0 &&
+      /* if LCK was not removed */
+      fstat(env->me_lfd, &lck_info) == 0 && lck_info.st_nlink > 0 &&
+      lck_op(env->me_lazy_fd, op_setlk,
+             (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0,
+             OFF_T_MAX) == 0) {
+
+    mdbx_verbose("%p got exclusive, drown locks", (void *)env);
+#if MDBX_LOCKING == MDBX_LOCKING_SYSV
+    if (env->me_sysv_ipc.semid != -1)
+      rc = semctl(env->me_sysv_ipc.semid, 2, IPC_RMID) ? errno : 0;
+#else
+    rc = mdbx_ipclock_destroy(&lck->mti_rlock);
+    if (rc == 0)
+      rc = mdbx_ipclock_destroy(&lck->mti_wlock);
+#endif /* MDBX_LOCKING */
+
+    mdbx_assert(env, rc == 0);
+    if (rc == 0) {
+      const bool synced = lck->mti_unsynced_pages.weak == 0;
+      mdbx_munmap(&env->me_lck_mmap);
+      if (synced)
+        rc = ftruncate(env->me_lfd, 0) ? errno : 0;
+    }
+
+    mdbx_jitter4testing(false);
+  }
+
+  /* 1) POSIX's fcntl() locks (i.e. when op_setlk == F_SETLK) should be
+   * restored after the file is closed.
+   *
+   * 2) File locks would be released (by the kernel) while the
+   * file-descriptors are being closed. But to avoid false-positive EACCES
+   * and EDEADLK from the kernel, locks should be released here explicitly
+   * in the proper order.
*/ + + /* close dxb and restore lock */ + if (env->me_dsync_fd != INVALID_HANDLE_VALUE) { + if (unlikely(close(env->me_dsync_fd) != 0) && rc == MDBX_SUCCESS) + rc = errno; + env->me_dsync_fd = INVALID_HANDLE_VALUE; + } + if (env->me_lazy_fd != INVALID_HANDLE_VALUE) { + if (unlikely(close(env->me_lazy_fd) != 0) && rc == MDBX_SUCCESS) + rc = errno; + env->me_lazy_fd = INVALID_HANDLE_VALUE; + if (op_setlk == F_SETLK && inprocess_neighbor && rc == MDBX_SUCCESS) { + /* restore file-lock */ + rc = lck_op( + inprocess_neighbor->me_lazy_fd, F_SETLKW, + (inprocess_neighbor->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, + (inprocess_neighbor->me_flags & MDBX_EXCLUSIVE) + ? 0 + : inprocess_neighbor->me_pid, + (inprocess_neighbor->me_flags & MDBX_EXCLUSIVE) ? OFF_T_MAX : 1); + } + } + + /* close clk and restore locks */ + if (env->me_lfd != INVALID_HANDLE_VALUE) { + if (unlikely(close(env->me_lfd) != 0) && rc == MDBX_SUCCESS) + rc = errno; + env->me_lfd = INVALID_HANDLE_VALUE; + if (op_setlk == F_SETLK && inprocess_neighbor && rc == MDBX_SUCCESS) { + /* restore file-locks */ + rc = lck_op(inprocess_neighbor->me_lfd, F_SETLKW, F_RDLCK, 0, 1); + if (rc == MDBX_SUCCESS && inprocess_neighbor->me_live_reader) + rc = mdbx_rpid_set(inprocess_neighbor); + } + } + + if (inprocess_neighbor && rc != MDBX_SUCCESS) + inprocess_neighbor->me_flags |= MDBX_FATAL_ERROR; + return rc; +} + +/*---------------------------------------------------------------------------*/ + +MDBX_INTERNAL_FUNC int __cold mdbx_lck_init(MDBX_env *env, + MDBX_env *inprocess_neighbor, + int global_uniqueness_flag) { +#if MDBX_LOCKING == MDBX_LOCKING_SYSV + int semid = -1; + /* don't initialize semaphores twice */ + (void)inprocess_neighbor; + if (global_uniqueness_flag == MDBX_RESULT_TRUE) { + struct stat st; + if (fstat(env->me_lazy_fd, &st)) + return errno; + sysv_retry_create: + semid = semget(env->me_sysv_ipc.key, 2, + IPC_CREAT | IPC_EXCL | + (st.st_mode & (S_IRWXU | S_IRWXG | S_IRWXO))); + if (unlikely(semid == -1)) { + int err = errno; + if (err != EEXIST) + return err; + + /* remove and re-create semaphore set */ + semid = semget(env->me_sysv_ipc.key, 2, 0); + if (semid == -1) { + err = errno; + if (err != ENOENT) + return err; + goto sysv_retry_create; + } + if (semctl(semid, 2, IPC_RMID)) { + err = errno; + if (err != EIDRM) + return err; + } + goto sysv_retry_create; + } + + unsigned short val_array[2] = {1, 1}; + if (semctl(semid, 2, SETALL, val_array)) + return errno; + } else { + semid = semget(env->me_sysv_ipc.key, 2, 0); + if (semid == -1) + return errno; + + /* check read & write access */ + struct semid_ds data[2]; + if (semctl(semid, 2, IPC_STAT, data) || semctl(semid, 2, IPC_SET, data)) + return errno; + } + + env->me_sysv_ipc.semid = semid; + return MDBX_SUCCESS; + +#elif MDBX_LOCKING == MDBX_LOCKING_FUTEX + (void)inprocess_neighbor; + if (global_uniqueness_flag != MDBX_RESULT_TRUE) + return MDBX_SUCCESS; +#error "FIXME: Not implemented" +#elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988 + + /* don't initialize semaphores twice */ + (void)inprocess_neighbor; + if (global_uniqueness_flag == MDBX_RESULT_TRUE) { + if (sem_init(&env->me_lck_mmap.lck->mti_rlock, true, 1)) + return errno; + if (sem_init(&env->me_lck_mmap.lck->mti_wlock, true, 1)) + return errno; + } + return MDBX_SUCCESS; + +#elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ + MDBX_LOCKING == MDBX_LOCKING_POSIX2008 + if (inprocess_neighbor) + return MDBX_SUCCESS /* don't need any initialization for mutexes + if LCK already opened/used inside current process */ + ; + + 
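/* Note: the attribute setup below makes the mutexes process-shared (and,
+   * for MDBX_LOCKING_POSIX2008, robust with priority inheritance where
+   * supported), which lets them live inside the mmap'ed lck-file shared
+   * between processes; robustness additionally allows recovery after a
+   * crash of the owning process. */
+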
/* FIXME: Unfortunately, there is no reliable way other than long testing
+   * on each platform. On the other hand, behavior like FreeBSD's is incorrect
+   * and we can expect it to be rare. Moreover, even on FreeBSD without
+   * additional in-process initialization, the probability of a problem
+   * occurring is vanishingly small, and the symptom is a return of EINVAL
+   * while locking a mutex. In other words, in the worst case, the problem
+   * results in an EINVAL error at the start of the transaction, but NOT data
+   * loss, nor database corruption, nor other fatal troubles. Thus, I am
+   * inclined to consider the code below a workaround for erroneous platforms
+   * (like FreeBSD), rather than a defect of libmdbx. */
+#if defined(__FreeBSD__)
+  /* seems that shared mutexes on FreeBSD require in-process initialization */
+  (void)global_uniqueness_flag;
+#else
+  /* shared mutexes on many other platforms (including Darwin and Linux's
+   * futexes) don't need any additional in-process initialization */
+  if (global_uniqueness_flag != MDBX_RESULT_TRUE)
+    return MDBX_SUCCESS;
+#endif
+
+  pthread_mutexattr_t ma;
+  int rc = pthread_mutexattr_init(&ma);
+  if (rc)
+    return rc;
+
+  rc = pthread_mutexattr_setpshared(&ma, PTHREAD_PROCESS_SHARED);
+  if (rc)
+    goto bailout;
+
+#if MDBX_LOCKING == MDBX_LOCKING_POSIX2008
+#if defined(PTHREAD_MUTEX_ROBUST) || defined(pthread_mutexattr_setrobust)
+  rc = pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST);
+#elif defined(PTHREAD_MUTEX_ROBUST_NP) || \
+    defined(pthread_mutexattr_setrobust_np)
+  rc = pthread_mutexattr_setrobust_np(&ma, PTHREAD_MUTEX_ROBUST_NP);
+#elif _POSIX_THREAD_PROCESS_SHARED < 200809L
+  rc = pthread_mutexattr_setrobust_np(&ma, PTHREAD_MUTEX_ROBUST_NP);
+#else
+  rc = pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST);
+#endif
+  if (rc)
+    goto bailout;
+#endif /* MDBX_LOCKING == MDBX_LOCKING_POSIX2008 */
+
+#if defined(_POSIX_THREAD_PRIO_INHERIT) && _POSIX_THREAD_PRIO_INHERIT >= 0 && \
+    !defined(MDBX_SAFE4QEMU)
+  rc = pthread_mutexattr_setprotocol(&ma, PTHREAD_PRIO_INHERIT);
+  if (rc == ENOTSUP)
+    rc = pthread_mutexattr_setprotocol(&ma, PTHREAD_PRIO_NONE);
+  if (rc && rc != ENOTSUP)
+    goto bailout;
+#endif /* PTHREAD_PRIO_INHERIT */
+
+  rc = pthread_mutexattr_settype(&ma, PTHREAD_MUTEX_ERRORCHECK);
+  if (rc && rc != ENOTSUP)
+    goto bailout;
+
+  rc = pthread_mutex_init(&env->me_lck_mmap.lck->mti_rlock, &ma);
+  if (rc)
+    goto bailout;
+  rc = pthread_mutex_init(&env->me_lck_mmap.lck->mti_wlock, &ma);
+
+bailout:
+  pthread_mutexattr_destroy(&ma);
+  return rc;
+#else
+#error "FIXME"
+#endif /* MDBX_LOCKING > 0 */
+}
+
+static int __cold mdbx_ipclock_failed(MDBX_env *env, mdbx_ipclock_t *ipc,
+                                      const int err) {
+  int rc = err;
+#if MDBX_LOCKING == MDBX_LOCKING_POSIX2008 || MDBX_LOCKING == MDBX_LOCKING_SYSV
+  if (err == EOWNERDEAD) {
+    /* We own the mutex. Clean up after dead previous owner. */
+
+    const bool rlocked = ipc == &env->me_lck->mti_rlock;
+    rc = MDBX_SUCCESS;
+    if (!rlocked) {
+      if (unlikely(env->me_txn)) {
+        /* env is hosed if the dead thread was ours */
+        env->me_flags |= MDBX_FATAL_ERROR;
+        env->me_txn = NULL;
+        rc = MDBX_PANIC;
+      }
+    }
+    mdbx_warning("%clock owner died, %s", (rlocked ? 'r' : 'w'),
+                 (rc ? "this process' env is hosed" : "recovering"));
+
+    int check_rc = mdbx_cleanup_dead_readers(env, rlocked, NULL);
+    check_rc = (check_rc == MDBX_SUCCESS) ? MDBX_RESULT_TRUE : check_rc;
+
+#if MDBX_LOCKING == MDBX_LOCKING_SYSV
+    rc = (rc == MDBX_SUCCESS) ? 
check_rc : rc; +#else +#if defined(PTHREAD_MUTEX_ROBUST) || defined(pthread_mutex_consistent) + int mreco_rc = pthread_mutex_consistent(ipc); +#elif defined(PTHREAD_MUTEX_ROBUST_NP) || defined(pthread_mutex_consistent_np) + int mreco_rc = pthread_mutex_consistent_np(ipc); +#elif _POSIX_THREAD_PROCESS_SHARED < 200809L + int mreco_rc = pthread_mutex_consistent_np(ipc); +#else + int mreco_rc = pthread_mutex_consistent(ipc); +#endif + check_rc = (mreco_rc == 0) ? check_rc : mreco_rc; + + if (unlikely(mreco_rc)) + mdbx_error("lock recovery failed, %s", mdbx_strerror(mreco_rc)); + + rc = (rc == MDBX_SUCCESS) ? check_rc : rc; + if (MDBX_IS_ERROR(rc)) + pthread_mutex_unlock(ipc); +#endif /* MDBX_LOCKING == MDBX_LOCKING_POSIX2008 */ + return rc; + } +#elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 + (void)ipc; +#elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988 + (void)ipc; +#elif MDBX_LOCKING == MDBX_LOCKING_FUTEX +#ifdef _MSC_VER +#pragma message("warning: TODO") +#else +#warning "TODO" +#endif + (void)ipc; +#else +#error "FIXME" +#endif /* MDBX_LOCKING */ + + mdbx_error("mutex (un)lock failed, %s", mdbx_strerror(err)); + if (rc != EDEADLK) + env->me_flags |= MDBX_FATAL_ERROR; + return rc; +} + +static int mdbx_ipclock_lock(MDBX_env *env, mdbx_ipclock_t *ipc, + const bool dont_wait) { +#if MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ + MDBX_LOCKING == MDBX_LOCKING_POSIX2008 + int rc = dont_wait ? pthread_mutex_trylock(ipc) : pthread_mutex_lock(ipc); + rc = (rc == EBUSY && dont_wait) ? MDBX_BUSY : rc; +#elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988 + int rc = MDBX_SUCCESS; + if (dont_wait) { + if (sem_trywait(ipc)) { + rc = errno; + if (rc == EAGAIN) + rc = MDBX_BUSY; + } + } else if (sem_wait(ipc)) + rc = errno; +#elif MDBX_LOCKING == MDBX_LOCKING_SYSV + struct sembuf op = {.sem_num = (ipc != &env->me_lck->mti_wlock), + .sem_op = -1, + .sem_flg = dont_wait ? IPC_NOWAIT | SEM_UNDO : SEM_UNDO}; + int rc; + if (semop(env->me_sysv_ipc.semid, &op, 1)) { + rc = errno; + if (dont_wait && rc == EAGAIN) + rc = MDBX_BUSY; + } else { + rc = *ipc ? EOWNERDEAD : MDBX_SUCCESS; + *ipc = env->me_pid; + } +#else +#error "FIXME" +#endif /* MDBX_LOCKING */ + + if (unlikely(rc != MDBX_SUCCESS && rc != MDBX_BUSY)) + rc = mdbx_ipclock_failed(env, ipc, rc); + return rc; +} + +static int mdbx_ipclock_unlock(MDBX_env *env, mdbx_ipclock_t *ipc) { +#if MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ + MDBX_LOCKING == MDBX_LOCKING_POSIX2008 + int rc = pthread_mutex_unlock(ipc); + (void)env; +#elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988 + int rc = sem_post(ipc) ? errno : MDBX_SUCCESS; + (void)env; +#elif MDBX_LOCKING == MDBX_LOCKING_SYSV + if (unlikely(*ipc != (pid_t)env->me_pid)) + return EPERM; + *ipc = 0; + struct sembuf op = {.sem_num = (ipc != &env->me_lck->mti_wlock), + .sem_op = 1, + .sem_flg = SEM_UNDO}; + int rc = semop(env->me_sysv_ipc.semid, &op, 1) ? 
errno : MDBX_SUCCESS; +#else +#error "FIXME" +#endif /* MDBX_LOCKING */ + return rc; +} + +MDBX_INTERNAL_FUNC int mdbx_rdt_lock(MDBX_env *env) { + mdbx_trace("%s", ">>"); + mdbx_jitter4testing(true); + int rc = mdbx_ipclock_lock(env, &env->me_lck->mti_rlock, false); + mdbx_trace("<< rc %d", rc); + return rc; +} + +MDBX_INTERNAL_FUNC void mdbx_rdt_unlock(MDBX_env *env) { + mdbx_trace("%s", ">>"); + int rc = mdbx_ipclock_unlock(env, &env->me_lck->mti_rlock); + mdbx_trace("<< rc %d", rc); + if (unlikely(rc != MDBX_SUCCESS)) + mdbx_panic("%s() failed: err %d\n", __func__, rc); + mdbx_jitter4testing(true); +} + +int mdbx_txn_lock(MDBX_env *env, bool dont_wait) { + mdbx_trace("%swait %s", dont_wait ? "dont-" : "", ">>"); + mdbx_jitter4testing(true); + int rc = mdbx_ipclock_lock(env, &env->me_lck->mti_wlock, dont_wait); + mdbx_trace("<< rc %d", rc); + return MDBX_IS_ERROR(rc) ? rc : MDBX_SUCCESS; +} + +void mdbx_txn_unlock(MDBX_env *env) { + mdbx_trace("%s", ">>"); + int rc = mdbx_ipclock_unlock(env, &env->me_lck->mti_wlock); + mdbx_trace("<< rc %d", rc); + if (unlikely(rc != MDBX_SUCCESS)) + mdbx_panic("%s() failed: err %d\n", __func__, rc); + mdbx_jitter4testing(true); +} + +#else +#ifdef _MSC_VER +#pragma warning(disable : 4206) /* nonstandard extension used: translation \ + unit is empty */ +#endif /* _MSC_VER (warnings) */ +#endif /* !Windows LCK-implementation */ diff --git a/mdbx/mdbx.go b/mdbx/mdbx.go index e781458..4691875 100644 --- a/mdbx/mdbx.go +++ b/mdbx/mdbx.go @@ -131,17 +131,13 @@ details about dealing with such situations. package mdbx /* -#cgo CFLAGS: -O2 -g -Wno-deprecated-declarations -pthread -W -Wall -Werror -Wextra -Wpedantic -fPIC -fvisibility=hidden -std=gnu11 -pthread -Wno-error=attributes -Wno-implicit-fallthrough -Wno-unused-function -Wno-unused-parameter -Wno-format-extra-args -Wbad-function-cast -Wno-missing-field-initializers -#cgo LDFLAGS: ${SRCDIR}/dist/mdbx-static.o -*/ -import "C" +#cgo !windows CFLAGS: -O2 -g -DMDBX_BUILD_FLAGS='' -fPIC -fvisibility=hidden -pthread -Wno-error=attributes -W -Wall -Werror -Wextra -Wpedantic -Wno-deprecated-declarations -Wno-format -Wno-implicit-fallthrough -Wno-unused-parameter -Wno-format-extra-args -Wno-missing-field-initializers +#cgo windows CFLAGS: -O2 -g -DMDBX_BUILD_FLAGS='' -fvisibility=hidden -ffast-math -fexceptions -fno-common -W -Wno-deprecated-declarations -Wno-bad-function-cast -Wno-cast-function-type -Wall -Wno-format -Wno-implicit-fallthrough -Wno-unused-parameter -Wno-format-extra-args -Wno-missing-field-initializers -/* - Expiremental try to compile mdbx by cgo - #define MDBX_CONFIG_H "config.h" - #cgo CFLAGS: -DNDEBUG=1 -ULIBMDBX_EXPORTS -std=gnu11 -W -Wall -Werror -Wextra -Wpedantic -Wno-deprecated-declarations -pthread -fPIC -fvisibility=hidden -std=gnu11 -pthread -Wno-error=attributes -Wno-implicit-fallthrough -Wno-unused-function -Wno-unused-parameter -Wno-format-extra-args -Wbad-function-cast -Wno-missing-field-initializers -O2 -g - //cc -ffunction-sections +#cgo windows LDFLAGS: -lntdll +#cgo linux LDFLAGS: -lrt */ +import "C" // Version return the major, minor, and patch version numbers of the LMDB C // library and a string representation of the version. diff --git a/mdbx/mdbx.h b/mdbx/mdbx.h new file mode 100644 index 0000000..7b719f4 --- /dev/null +++ b/mdbx/mdbx.h @@ -0,0 +1,5099 @@ +/** + +_libmdbx_ is an extremely fast, compact, powerful, embedded, +transactional [key-value +store](https://en.wikipedia.org/wiki/Key-value_database) database, with +[permissive license](./LICENSE). 
_MDBX_ has a specific set of properties and
+capabilities, focused on creating unique lightweight solutions with
+extraordinary performance.
+
+_libmdbx_ is superior to [LMDB](https://bit.ly/26ts7tL) in terms of features
+and reliability, not inferior in performance. In comparison to LMDB, _libmdbx_
+makes many things just work perfectly, not silently and catastrophically
+break down. _libmdbx_ supports Linux, Windows, MacOS, OSX, iOS, Android,
+FreeBSD, DragonFly, Solaris, OpenSolaris, OpenIndiana, NetBSD, OpenBSD and other
+systems compliant with POSIX.1-2008.
+
+_The Future will (be) [Positive](https://www.ptsecurity.com). Everything will be fine._
+
+
+\section copyright LICENSE & COPYRIGHT
+
+\authors Copyright (c) 2015-2021, Leonid Yuriev
+and other _libmdbx_ authors: please see [AUTHORS](./AUTHORS) file.
+
+\copyright Redistribution and use in source and binary forms, with or without
+modification, are permitted only as authorized by the OpenLDAP Public License.
+
+A copy of this license is available in the file LICENSE in the
+top-level directory of the distribution or, alternatively, at
+<http://www.OpenLDAP.org/license.html>.
+
+ ---
+
+This code is derived from "LMDB engine" written by
+Howard Chu (Symas Corporation), which itself is derived from btree.c
+written by Martin Hedenfalk.
+
+ ---
+
+Portions Copyright 2011-2015 Howard Chu, Symas Corp. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted only as authorized by the OpenLDAP
+Public License.
+
+A copy of this license is available in the file LICENSE in the
+top-level directory of the distribution or, alternatively, at
+<http://www.OpenLDAP.org/license.html>.
+
+ ---
+
+Portions Copyright (c) 2009, 2010 Martin Hedenfalk
+
+Permission to use, copy, modify, and distribute this software for any
+purpose with or without fee is hereby granted, provided that the above
+copyright notice and this permission notice appear in all copies.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+*******************************************************************************/
+
+#pragma once
+#ifndef LIBMDBX_H
+#define LIBMDBX_H
+
+#ifdef _MSC_VER
+#pragma warning(push, 1)
+#pragma warning(disable : 4548) /* expression before comma has no effect; \
+                                   expected expression with side - effect */
+#pragma warning(disable : 4530) /* C++ exception handler used, but unwind \
+                                 * semantics are not enabled. Specify /EHsc */
+#pragma warning(disable : 4577) /* 'noexcept' used with no exception handling \
+                                 * mode specified; termination on exception is \
+                                 * not guaranteed. Specify /EHsc */
+#endif /* _MSC_VER (warnings) */
+
+/* *INDENT-OFF* */
+/* clang-format off */
+
+/**
+  \file mdbx.h
+  \brief The libmdbx C API header file
+
+  \defgroup c_api C API
+  @{
+  \defgroup c_err Error handling
+  \defgroup c_opening Opening & Closing
+  \defgroup c_transactions Transactions
+  \defgroup c_dbi Databases
+  \defgroup c_crud Create/Read/Update/Delete (see Quick Reference in details)
+
+  \details
+  \anchor c_crud_hints
+# Quick Reference for Insert/Update/Delete operations
+
+Historically, libmdbx inherits the API basis from LMDB, where it is often
+difficult to select flags/options and functions for the desired operation.
+So it is recommended to use these hints.
+
+## Databases with UNIQUE keys
+
+In databases created without the \ref MDBX_DUPSORT option, keys are always
+unique. Thus a single value always corresponds to each key, and so there
+are only a few cases of changing data.
+
+| Case                                        | Flags to use        | Result                 |
+|---------------------------------------------|---------------------|------------------------|
+| _INSERTING_|||
+|Key is absent → Insertion                    |\ref MDBX_NOOVERWRITE|Insertion               |
+|Key exists → Error since the key is present  |\ref MDBX_NOOVERWRITE|Error \ref MDBX_KEYEXIST and return of the present value|
+| _UPSERTING_|||
+|Key is absent → Insertion                    |\ref MDBX_UPSERT     |Insertion               |
+|Key exists → Update                          |\ref MDBX_UPSERT     |Update                  |
+| _UPDATING_|||
+|Key is absent → Error since no such key      |\ref MDBX_CURRENT    |Error \ref MDBX_NOTFOUND|
+|Key exists → Update                          |\ref MDBX_CURRENT    |Update value            |
+| _DELETING_|||
+|Key is absent → Error since no such key      |\ref mdbx_del() or \ref mdbx_replace()|Error \ref MDBX_NOTFOUND|
+|Key exists → Delete by key                   |\ref mdbx_del() with the parameter `data = NULL`|Deletion|
+|Key exists → Delete by key with data matching check|\ref mdbx_del() with the parameter `data` filled with the value which should match for deletion|Deletion or \ref MDBX_NOTFOUND if the value does not match|
+|Delete at the current cursor position        |\ref mdbx_cursor_del() with \ref MDBX_CURRENT flag|Deletion|
+|Extract (read & delete) value by the key     |\ref mdbx_replace() with zero flag and parameter `new_data = NULL`|Returning a deleted value|
+
+
+## Databases with NON-UNIQUE keys
+
+In databases created with the \ref MDBX_DUPSORT (Sorted Duplicates) option, keys
+may be non-unique. Such non-unique keys in a key-value database may be treated
+as duplicates, or as multiple values corresponding to a single key. 
+
+
+| Case                                        | Flags to use        | Result                 |
+|---------------------------------------------|---------------------|------------------------|
+| _INSERTING_|||
+|Key is absent → Insertion                    |\ref MDBX_NOOVERWRITE|Insertion|
+|Key exists → No need to add new values       |\ref MDBX_NOOVERWRITE|Error \ref MDBX_KEYEXIST with returning the first value from those already present|
+| _UPSERTING_|||
+|Key is absent → Insertion                    |\ref MDBX_UPSERT     |Insertion|
+|Key exists → Want to add new values          |\ref MDBX_UPSERT     |Add one more value to the key|
+|Key exists → Replace all values with a new one|\ref MDBX_UPSERT + \ref MDBX_ALLDUPS|Overwrite by single new value|
+| _UPDATING_|||
+|Key is absent → Error since no such key      |\ref MDBX_CURRENT    |Error \ref MDBX_NOTFOUND|
+|Key exists, Single value → Update            |\ref MDBX_CURRENT    |Update single value     |
+|Key exists, Multiple values → Replace all values with a new one|\ref MDBX_CURRENT + \ref MDBX_ALLDUPS|Overwrite by single new value|
+|Key exists, Multiple values → Error since it is unclear which of the values should be updated|\ref mdbx_put() with \ref MDBX_CURRENT|Error \ref MDBX_EMULTIVAL|
+|Key exists, Multiple values → Update particular entry of multi-value|\ref mdbx_replace() with \ref MDBX_CURRENT + \ref MDBX_NOOVERWRITE and the parameter `old_value` filled with the value that you want to update|Update one multi-value entry|
+|Key exists, Multiple values → Update the current entry of multi-value|\ref mdbx_cursor_put() with \ref MDBX_CURRENT|Update one multi-value entry|
+| _DELETING_|||
+|Key is absent → Error since no such key      |\ref mdbx_del() or \ref mdbx_replace()|Error \ref MDBX_NOTFOUND|
+|Key exists → Delete all values corresponding to the given key|\ref mdbx_del() with the parameter `data = NULL`|Deletion|
+|Key exists → Delete a particular value corresponding to the given key|\ref mdbx_del() with the parameter `data` filled with the value that you want to delete, or \ref mdbx_replace() with \ref MDBX_CURRENT + \ref MDBX_NOOVERWRITE and the `old_value` parameter filled with the value that you want to delete and `new_data = NULL`| Deletion or \ref MDBX_NOTFOUND if no such key-value pair|
+|Delete one value at the current cursor position|\ref mdbx_cursor_del() with \ref MDBX_CURRENT flag|Deletion of only the current entry|
+|Delete all values of key at the current cursor position|\ref mdbx_cursor_del() with \ref MDBX_ALLDUPS flag|Deletion of all duplicates of the key (all multi-values) at the current cursor position|
+
+\note A brief usage sketch for these operations is provided below, after the
+platform-specific includes.
+
+  \defgroup c_cursors Cursors
+  \defgroup c_statinfo Statistics & Information
+  \defgroup c_settings Settings
+  \defgroup c_debug Logging and runtime debug
+  \defgroup c_rqest Range query estimation
+  \defgroup c_extra Extra operations
+*/
+
+/* *INDENT-ON* */
+/* clang-format on */
+
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#if defined(_WIN32) || defined(_WIN64)
+#include <windows.h>
+#include <winnt.h>
+#ifndef __mode_t_defined
+typedef unsigned short mdbx_mode_t;
+#else
+typedef mode_t mdbx_mode_t;
+#endif /* __mode_t_defined */
+typedef HANDLE mdbx_filehandle_t;
+typedef DWORD mdbx_pid_t;
+typedef DWORD mdbx_tid_t;
+#else /* Windows */
+#include <errno.h>     /* for error codes */
+#include <pthread.h>   /* for pthread_t */
+#include <sys/types.h> /* for pid_t */
+#include <sys/uio.h>   /* for struct iovec */
+#define HAVE_STRUCT_IOVEC 1
+typedef int mdbx_filehandle_t;
+typedef pid_t mdbx_pid_t;
+typedef pthread_t mdbx_tid_t;
+typedef mode_t mdbx_mode_t;
+#endif /* !Windows */
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+/** @} close c_api
+ * \defgroup api_macros Common Macros
+ * @{ */
+
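+/** \details A minimal usage sketch for the Quick Reference above. This is
+ * illustrative only, not part of the API reference: error handling is
+ * omitted, and `txn` and `dbi` are assumed to be a valid write transaction
+ * and an open database handle.
+ * \code
+ * MDBX_val key = {(void *)"alice", 5}, value = {(void *)"v1", 2};
+ * // INSERTING: fails with MDBX_KEYEXIST if the key is already present
+ * int rc = mdbx_put(txn, dbi, &key, &value, MDBX_NOOVERWRITE);
+ * // UPSERTING: insert a new pair or update an existing one
+ * rc = mdbx_put(txn, dbi, &key, &value, MDBX_UPSERT);
+ * // UPDATING: fails with MDBX_NOTFOUND if the key is absent
+ * rc = mdbx_put(txn, dbi, &key, &value, MDBX_CURRENT);
+ * // DELETING by key, regardless of the value (data = NULL)
+ * rc = mdbx_del(txn, dbi, &key, NULL);
+ * \endcode */
+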
+
+/*----------------------------------------------------------------------------*/
+
+#ifndef __has_attribute
+#define __has_attribute(x) (0)
+#endif /* __has_attribute */
+
+#ifndef __has_cpp_attribute
+#define __has_cpp_attribute(x) 0
+#endif /* __has_cpp_attribute */
+
+#ifndef __has_feature
+#define __has_feature(x) (0)
+#endif /* __has_feature */
+
+#ifndef __has_extension
+#define __has_extension(x) (0)
+#endif /* __has_extension */
+
+#ifndef __has_builtin
+#define __has_builtin(x) (0)
+#endif /* __has_builtin */
+
+/** Many functions have no effects except the return value and their
+ * return value depends only on the parameters and/or global variables.
+ * Such a function can be subject to common subexpression elimination
+ * and loop optimization just as an arithmetic operator would be.
+ * These functions should be declared with the attribute pure. */
+#if (defined(__GNUC__) || __has_attribute(__pure__)) && \
+    (!defined(__clang__) /* https://bugs.llvm.org/show_bug.cgi?id=43275 */ \
+     || !defined(__cplusplus) || !__has_feature(cxx_exceptions))
+#define MDBX_PURE_FUNCTION __attribute__((__pure__))
+#elif defined(_MSC_VER) && !defined(__clang__) && _MSC_VER >= 1920
+#define MDBX_PURE_FUNCTION
+#elif defined(__cplusplus) && __has_cpp_attribute(gnu::pure) && \
+    (!defined(__clang__) || !__has_feature(cxx_exceptions))
+#define MDBX_PURE_FUNCTION [[gnu::pure]]
+#else
+#define MDBX_PURE_FUNCTION
+#endif /* MDBX_PURE_FUNCTION */
+
+/** Like \ref MDBX_PURE_FUNCTION with the additional `noexcept` restriction
+ * that is compatible with CLANG and the proposed [[pure]]. */
+#if defined(__GNUC__) || \
+    (__has_attribute(__pure__) && __has_attribute(__nothrow__))
+#define MDBX_NOTHROW_PURE_FUNCTION __attribute__((__pure__, __nothrow__))
+#elif defined(_MSC_VER) && !defined(__clang__) && _MSC_VER >= 1920
+#if __has_cpp_attribute(pure)
+#define MDBX_NOTHROW_PURE_FUNCTION [[pure]]
+#else
+#define MDBX_NOTHROW_PURE_FUNCTION
+#endif
+#elif defined(__cplusplus) && __has_cpp_attribute(gnu::pure)
+#if __has_cpp_attribute(gnu::nothrow)
+#define MDBX_NOTHROW_PURE_FUNCTION [[gnu::pure, gnu::nothrow]]
+#else
+#define MDBX_NOTHROW_PURE_FUNCTION [[gnu::pure]]
+#endif
+#elif defined(__cplusplus) && __has_cpp_attribute(pure)
+#define MDBX_NOTHROW_PURE_FUNCTION [[pure]]
+#else
+#define MDBX_NOTHROW_PURE_FUNCTION
+#endif /* MDBX_NOTHROW_PURE_FUNCTION */
+
+/** Many functions do not examine any values except their arguments,
+ * and have no effects except the return value. Basically this is a
+ * slightly stricter class than the PURE attribute, since the function
+ * is not allowed to read global memory.
+ *
+ * Note that a function that has pointer arguments and examines the
+ * data pointed to must not be declared const. Likewise, a function
+ * that calls a non-const function usually must not be const.
+ * It does not make sense for a const function to return void. 
+ */
+#if (defined(__GNUC__) || __has_attribute(__pure__)) && \
+    (!defined(__clang__) /* https://bugs.llvm.org/show_bug.cgi?id=43275 */ \
+     || !defined(__cplusplus) || !__has_feature(cxx_exceptions))
+#define MDBX_CONST_FUNCTION __attribute__((__const__))
+#elif defined(_MSC_VER) && !defined(__clang__) && _MSC_VER >= 1920
+#define MDBX_CONST_FUNCTION MDBX_PURE_FUNCTION
+#elif defined(__cplusplus) && __has_cpp_attribute(gnu::const) && \
+    (!defined(__clang__) || !__has_feature(cxx_exceptions))
+#define MDBX_CONST_FUNCTION [[gnu::const]]
+#else
+#define MDBX_CONST_FUNCTION MDBX_PURE_FUNCTION
+#endif /* MDBX_CONST_FUNCTION */
+
+/** Like \ref MDBX_CONST_FUNCTION with the additional `noexcept` restriction
+ * that is compatible with CLANG and the future [[const]]. */
+#if defined(__GNUC__) || \
+    (__has_attribute(__const__) && __has_attribute(__nothrow__))
+#define MDBX_NOTHROW_CONST_FUNCTION __attribute__((__const__, __nothrow__))
+#elif defined(_MSC_VER) && !defined(__clang__) && _MSC_VER >= 1920
+#define MDBX_NOTHROW_CONST_FUNCTION MDBX_NOTHROW_PURE_FUNCTION
+#elif defined(__cplusplus) && __has_cpp_attribute(gnu::const)
+#if __has_cpp_attribute(gnu::nothrow)
+#define MDBX_NOTHROW_CONST_FUNCTION [[gnu::const, gnu::nothrow]]
+#else
+#define MDBX_NOTHROW_CONST_FUNCTION [[gnu::const]]
+#endif
+#elif defined(__cplusplus) && __has_cpp_attribute(const)
+#define MDBX_NOTHROW_CONST_FUNCTION [[const]]
+#else
+#define MDBX_NOTHROW_CONST_FUNCTION MDBX_NOTHROW_PURE_FUNCTION
+#endif /* MDBX_NOTHROW_CONST_FUNCTION */
+
+#ifndef MDBX_DEPRECATED /* may be predefined to avoid warnings "deprecated" */
+#ifdef __deprecated
+#define MDBX_DEPRECATED __deprecated
+#elif defined(__GNUC__) || __has_attribute(__deprecated__)
+#define MDBX_DEPRECATED __attribute__((__deprecated__))
+#elif defined(_MSC_VER)
+#define MDBX_DEPRECATED __declspec(deprecated)
+#else
+#define MDBX_DEPRECATED
+#endif
+#endif /* MDBX_DEPRECATED */
+
+#ifndef __dll_export
+#if defined(_WIN32) || defined(__CYGWIN__)
+#if defined(__GNUC__) || __has_attribute(__dllexport__)
+#define __dll_export __attribute__((__dllexport__))
+#elif defined(_MSC_VER)
+#define __dll_export __declspec(dllexport)
+#else
+#define __dll_export
+#endif
+#elif defined(__GNUC__) || __has_attribute(__visibility__)
+#define __dll_export __attribute__((__visibility__("default")))
+#else
+#define __dll_export
+#endif
+#endif /* __dll_export */
+
+#ifndef __dll_import
+#if defined(_WIN32) || defined(__CYGWIN__)
+#if defined(__GNUC__) || __has_attribute(__dllimport__)
+#define __dll_import __attribute__((__dllimport__))
+#elif defined(_MSC_VER)
+#define __dll_import __declspec(dllimport)
+#else
+#define __dll_import
+#endif
+#else
+#define __dll_import
+#endif
+#endif /* __dll_import */
+
+/** \brief Auxiliary macro to robustly define both the inline version of an API
+ * function and a non-inline fallback dll-exported version for applications
+ * linked with an old version of libmdbx, with a strictly ODR-common
+ * implementation.
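+ *
+ * A usage sketch (illustrative only; `mdbx_foo` and `mdbx_foo_ex` are
+ * hypothetical names, not part of the API):
+ * \code
+ * LIBMDBX_INLINE_API(int, mdbx_foo, (MDBX_env *env)) {
+ *   return mdbx_foo_ex(env, 0);
+ * }
+ * \endcode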
+ */
+#if defined(LIBMDBX_INTERNALS) && !defined(LIBMDBX_NO_EXPORTS_LEGACY_API)
+#define LIBMDBX_INLINE_API(TYPE, NAME, ARGS) \
+  /* proto of exported which uses common impl */ LIBMDBX_API TYPE NAME ARGS; \
+  /* definition of common impl */ static __inline TYPE __inline_##NAME ARGS
+#else
+#define LIBMDBX_INLINE_API(TYPE, NAME, ARGS) static __inline TYPE NAME ARGS
+#endif /* LIBMDBX_INLINE_API */
+
+/*----------------------------------------------------------------------------*/
+
+#ifndef __cplusplus
+#ifndef bool
+#define bool _Bool
+#endif
+#ifndef true
+#define true (1)
+#endif
+#ifndef false
+#define false (0)
+#endif
+#endif /* bool without __cplusplus */
+
+#if !defined(DOXYGEN) && (!defined(__cpp_noexcept_function_type) || \
+                          __cpp_noexcept_function_type < 201510L)
+#define MDBX_CXX17_NOEXCEPT
+#else
+#define MDBX_CXX17_NOEXCEPT noexcept
+#endif /* MDBX_CXX17_NOEXCEPT */
+
+/* Workaround for old compilers without proper support for constexpr. */
+#if !defined(__cplusplus)
+#define MDBX_CXX01_CONSTEXPR __inline
+#define MDBX_CXX01_CONSTEXPR_VAR const
+#elif !defined(DOXYGEN) && \
+    (!defined(__cpp_constexpr) || __cpp_constexpr < 200704L || \
+     (defined(__LCC__) && __LCC__ < 124) || \
+     (defined(__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ < 407) && \
+      !defined(__clang__) && !defined(__LCC__)) || \
+     (defined(_MSC_VER) && _MSC_VER < 1910) || \
+     (defined(__clang__) && __clang_major__ < 4))
+#define MDBX_CXX01_CONSTEXPR inline
+#define MDBX_CXX01_CONSTEXPR_VAR const
+#else
+#define MDBX_CXX01_CONSTEXPR constexpr
+#define MDBX_CXX01_CONSTEXPR_VAR constexpr
+#endif /* MDBX_CXX01_CONSTEXPR */
+
+#if !defined(__cplusplus)
+#define MDBX_CXX11_CONSTEXPR __inline
+#define MDBX_CXX11_CONSTEXPR_VAR const
+#elif !defined(DOXYGEN) && \
+    (!defined(__cpp_constexpr) || __cpp_constexpr < 201304 || \
+     (defined(__LCC__) && __LCC__ < 124) || \
+     (defined(__GNUC__) && __GNUC__ < 6 && !defined(__clang__) && \
+      !defined(__LCC__)) || \
+     (defined(_MSC_VER) && _MSC_VER < 1910) || \
+     (defined(__clang__) && __clang_major__ < 5))
+#define MDBX_CXX11_CONSTEXPR inline
+#define MDBX_CXX11_CONSTEXPR_VAR const
+#else
+#define MDBX_CXX11_CONSTEXPR constexpr
+#define MDBX_CXX11_CONSTEXPR_VAR constexpr
+#endif /* MDBX_CXX11_CONSTEXPR */
+
+#if !defined(__cplusplus)
+#define MDBX_CXX14_CONSTEXPR __inline
+#define MDBX_CXX14_CONSTEXPR_VAR const
+#elif defined(DOXYGEN) || \
+    defined(__cpp_constexpr) && __cpp_constexpr >= 201304L && \
+        ((defined(_MSC_VER) && _MSC_VER >= 1910) || \
+         (defined(__clang__) && __clang_major__ > 4) || \
+         (defined(__GNUC__) && __GNUC__ > 6) || \
+         (!defined(__GNUC__) && !defined(__clang__) && !defined(_MSC_VER)))
+#define MDBX_CXX14_CONSTEXPR constexpr
+#define MDBX_CXX14_CONSTEXPR_VAR constexpr
+#else
+#define MDBX_CXX14_CONSTEXPR inline
+#define MDBX_CXX14_CONSTEXPR_VAR const
+#endif /* MDBX_CXX14_CONSTEXPR */
+
+#if defined(__noreturn)
+#define MDBX_NORETURN __noreturn
+#elif defined(_Noreturn)
+#define MDBX_NORETURN _Noreturn
+#elif defined(__GNUC__) || __has_attribute(__noreturn__)
+#define MDBX_NORETURN __attribute__((__noreturn__))
+#elif defined(_MSC_VER) && !defined(__clang__)
+#define MDBX_NORETURN __declspec(noreturn)
+#else
+#define MDBX_NORETURN
+#endif /* MDBX_NORETURN */
+
+#ifndef MDBX_PRINTF_ARGS
+#if defined(__GNUC__) || __has_attribute(__format__)
+#define MDBX_PRINTF_ARGS(format_index, first_arg) \
+  __attribute__((__format__(__printf__, format_index, first_arg)))
+#else
+#define MDBX_PRINTF_ARGS(format_index, first_arg)
+#endif
+#endif /* MDBX_PRINTF_ARGS */
+
+#if 
defined(DOXYGEN) || (__has_cpp_attribute(maybe_unused) && \ + (defined(__cplusplus) || __STDC_VERSION__ > 202005L)) +#define MDBX_MAYBE_UNUSED [[maybe_unused]] +#elif defined(__GNUC__) || __has_attribute(__unused__) +#define MDBX_MAYBE_UNUSED __attribute__((__unused__)) +#else +#define MDBX_MAYBE_UNUSED +#endif /* MDBX_MAYBE_UNUSED */ + +/* Oh, below are some songs and dances since: + * - C++ requires explicit definition of the necessary operators. + * - the proper implementation of DEFINE_ENUM_FLAG_OPERATORS for C++ required + * the constexpr feature which is broken in most old compilers; + * - DEFINE_ENUM_FLAG_OPERATORS may be defined broken as in the Windows SDK. */ +#ifndef DEFINE_ENUM_FLAG_OPERATORS + +#ifdef __cplusplus +#if !defined(__cpp_constexpr) || __cpp_constexpr < 200704L || \ + (defined(__LCC__) && __LCC__ < 124) || \ + (defined(__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ < 407) && \ + !defined(__clang__) && !defined(__LCC__)) || \ + (defined(_MSC_VER) && _MSC_VER < 1910) || \ + (defined(__clang__) && __clang_major__ < 4) +/* The constexpr feature is not available or (may be) broken */ +#define CONSTEXPR_ENUM_FLAGS_OPERATIONS 0 +#else +/* C always allows these operators for enums */ +#define CONSTEXPR_ENUM_FLAGS_OPERATIONS 1 +#endif /* __cpp_constexpr */ + +/// Define operator overloads to enable bit operations on enum values that are +/// used to define flags (based on Microsoft's DEFINE_ENUM_FLAG_OPERATORS). +#define DEFINE_ENUM_FLAG_OPERATORS(ENUM) \ + extern "C++" { \ + MDBX_CXX01_CONSTEXPR ENUM operator|(ENUM a, ENUM b) { \ + return ENUM(unsigned(a) | unsigned(b)); \ + } \ + MDBX_CXX14_CONSTEXPR ENUM &operator|=(ENUM &a, ENUM b) { return a = a | b; } \ + MDBX_CXX01_CONSTEXPR ENUM operator&(ENUM a, ENUM b) { \ + return ENUM(unsigned(a) & unsigned(b)); \ + } \ + MDBX_CXX01_CONSTEXPR ENUM operator&(ENUM a, unsigned b) { \ + return ENUM(unsigned(a) & b); \ + } \ + MDBX_CXX01_CONSTEXPR ENUM operator&(unsigned a, ENUM b) { \ + return ENUM(a & unsigned(b)); \ + } \ + MDBX_CXX14_CONSTEXPR ENUM &operator&=(ENUM &a, ENUM b) { return a = a & b; } \ + MDBX_CXX14_CONSTEXPR ENUM &operator&=(ENUM &a, unsigned b) { \ + return a = a & b; \ + } \ + MDBX_CXX01_CONSTEXPR unsigned operator~(ENUM a) { return ~unsigned(a); } \ + MDBX_CXX01_CONSTEXPR ENUM operator^(ENUM a, ENUM b) { \ + return ENUM(unsigned(a) ^ unsigned(b)); \ + } \ + MDBX_CXX14_CONSTEXPR ENUM &operator^=(ENUM &a, ENUM b) { return a = a ^ b; } \ + } +#else /* __cplusplus */ +/* nope for C since it always allows these operators for enums */ +#define DEFINE_ENUM_FLAG_OPERATORS(ENUM) +#define CONSTEXPR_ENUM_FLAGS_OPERATIONS 1 +#endif /* !__cplusplus */ + +#elif !defined(CONSTEXPR_ENUM_FLAGS_OPERATIONS) + +#ifdef __cplusplus +/* DEFINE_ENUM_FLAG_OPERATORS may be defined broken as in the Windows SDK */ +#define CONSTEXPR_ENUM_FLAGS_OPERATIONS 0 +#else +/* C always allows these operators for enums */ +#define CONSTEXPR_ENUM_FLAGS_OPERATIONS 1 +#endif + +#endif /* DEFINE_ENUM_FLAG_OPERATORS */ + +/** @} end of Common Macros */ + +/*----------------------------------------------------------------------------*/ + +/** \addtogroup c_api + * @{ */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* MDBX version 0.10.x */ +#define MDBX_VERSION_MAJOR 0 +#define MDBX_VERSION_MINOR 10 + +#ifndef LIBMDBX_API +#if defined(LIBMDBX_EXPORTS) +#define LIBMDBX_API __dll_export +#elif defined(LIBMDBX_IMPORTS) +#define LIBMDBX_API __dll_import +#else +#define LIBMDBX_API +#endif +#endif /* LIBMDBX_API */ + +#ifdef __cplusplus +#if defined(__clang__) 
|| __has_attribute(type_visibility)
+#define LIBMDBX_API_TYPE LIBMDBX_API __attribute__((type_visibility("default")))
+#else
+#define LIBMDBX_API_TYPE LIBMDBX_API
+#endif
+#else
+#define LIBMDBX_API_TYPE
+#endif /* LIBMDBX_API_TYPE */
+
+#if defined(LIBMDBX_IMPORTS)
+#define LIBMDBX_VERINFO_API __dll_import
+#else
+#define LIBMDBX_VERINFO_API __dll_export
+#endif /* LIBMDBX_VERINFO_API */
+
+/** \brief libmdbx version information */
+extern LIBMDBX_VERINFO_API const struct MDBX_version_info {
+  uint8_t major;     /**< Major version number */
+  uint8_t minor;     /**< Minor version number */
+  uint16_t release;  /**< Release number of Major.Minor */
+  uint32_t revision; /**< Revision number of Release */
+  struct {
+    const char *datetime; /**< committer date, strict ISO-8601 format */
+    const char *tree;     /**< tree hash, i.e. digest of the source code */
+    const char *commit;   /**< commit hash (hexadecimal digits) */
+    const char *describe; /**< git-describe string */
+  } git;                  /**< source information from git */
+  const char *sourcery;   /**< sourcery anchor for pinning */
+} /** \brief libmdbx version information */ mdbx_version;
+
+/** \brief libmdbx build information
+ * \attention Some strings could be NULL in case no corresponding information
+ * was provided at build time (i.e. flags). */
+extern LIBMDBX_VERINFO_API const struct MDBX_build_info {
+  const char *datetime; /**< build timestamp (ISO-8601 or __DATE__ __TIME__) */
+  const char *target;   /**< cpu/arch-system-config triplet */
+  const char *options;  /**< mdbx-related options */
+  const char *compiler; /**< compiler */
+  const char *flags;    /**< CFLAGS and CXXFLAGS */
+} /** \brief libmdbx build information */ mdbx_build;
+
+#if (defined(_WIN32) || defined(_WIN64)) && !MDBX_BUILD_SHARED_LIBRARY
+/* MDBX internally uses global and thread-local storage destructors for
+ * automatic (de)initialization, releasing reader lock table slots,
+ * and so on.
+ *
+ * If MDBX is built as a DLL, this is done out-of-the-box by the DllEntry()
+ * function, which is called automatically by the Windows core, passing the
+ * corresponding reason argument.
+ *
+ * Otherwise, if MDBX is not built as a DLL, some black magic
+ * may be required depending on the Windows version:
+ *
+ *  - Modern Windows versions, including Windows Vista and later, provide
+ *    support for "TLS Directory" (e.g .CRT$XL[A-Z] sections in executable
+ *    or dll file). In this case, MDBX is capable of doing everything
+ *    automatically, therefore you DON'T NEED to call mdbx_module_handler(),
+ *    and MDBX_MANUAL_MODULE_HANDLER is defined as 0.
+ *
+ *  - Obsolete versions of Windows, prior to Windows Vista, REQUIRE calling
+ *    mdbx_module_handler() manually from the corresponding DllMain() or
+ *    WinMain() of your DLL or application,
+ *    so MDBX_MANUAL_MODULE_HANDLER is defined as 1.
+ *
+ * Therefore, building MDBX as a DLL is recommended for all versions of
+ * Windows. So, if you doubt, just build MDBX as the separate DLL and don't
+ * care about the MDBX_MANUAL_MODULE_HANDLER. */
+
+#ifndef _WIN32_WINNT
+#error Non-dll build libmdbx requires target Windows version \
+  to be explicitly defined via _WIN32_WINNT for proper \
+  handling of thread-local storage destructors.
+#endif /* _WIN32_WINNT */
+
+#if _WIN32_WINNT >= 0x0600 /* Windows Vista */
+/* As described above mdbx_module_handler() is NOT needed for Windows Vista
+ * and later. */
+#define MDBX_MANUAL_MODULE_HANDLER 0
+#else
+/* As described above mdbx_module_handler() IS REQUIRED for Windows versions
+ * prior to Windows Vista.
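+ *
+ * A sketch of the manual case (illustrative only):
+ *
+ *   BOOL WINAPI DllMain(HINSTANCE instance, DWORD reason, LPVOID reserved) {
+ *     mdbx_module_handler(instance, reason, reserved);
+ *     return TRUE;
+ *   }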
+ */
+#define MDBX_MANUAL_MODULE_HANDLER 1
+void LIBMDBX_API NTAPI mdbx_module_handler(PVOID module, DWORD reason,
+                                           PVOID reserved);
+#endif
+
+#endif /* Windows && !DLL && MDBX_MANUAL_MODULE_HANDLER */
+
+/* OPACITY STRUCTURES *********************************************************/
+
+/** \brief Opaque structure for a database environment.
+ * \details An environment supports multiple key-value sub-databases (aka
+ * key-value spaces or tables), all residing in the same shared-memory map.
+ * \see mdbx_env_create() \see mdbx_env_close() */
+#ifndef __cplusplus
+typedef struct MDBX_env MDBX_env;
+#else
+struct MDBX_env;
+#endif
+
+/** \brief Opaque structure for a transaction handle.
+ * \ingroup c_transactions
+ * \details All database operations require a transaction handle. Transactions
+ * may be read-only or read-write.
+ * \see mdbx_txn_begin() \see mdbx_txn_commit() \see mdbx_txn_abort() */
+#ifndef __cplusplus
+typedef struct MDBX_txn MDBX_txn;
+#else
+struct MDBX_txn;
+#endif
+
+/** \brief A handle for an individual database (key-value spaces) in the
+ * environment. \ingroup c_dbi \details Zero handle is used internally (hidden
+ * Garbage Collection DB). So, any valid DBI-handle is greater than 0 and less
+ * than or equal to \ref MDBX_MAX_DBI. \see mdbx_dbi_open() \see
+ * mdbx_dbi_close() */
+typedef uint32_t MDBX_dbi;
+
+/** \brief Opaque structure for navigating through a database
+ * \ingroup c_cursors
+ * \see mdbx_cursor_create() \see mdbx_cursor_bind() \see mdbx_cursor_close()
+ */
+#ifndef __cplusplus
+typedef struct MDBX_cursor MDBX_cursor;
+#else
+struct MDBX_cursor;
+#endif
+
+/** \brief Generic structure used for passing keys and data in and out of the
+ * database.
+ * \anchor MDBX_val \see mdbx::slice \see mdbx::buffer
+ *
+ * \details Values returned from the database are valid only until a subsequent
+ * update operation, or the end of the transaction. Do not modify or
+ * free them, they commonly point into the database itself.
+ *
+ * Key sizes must be between 0 and \ref mdbx_env_get_maxkeysize() inclusive.
+ * The same applies to data sizes in databases with the \ref MDBX_DUPSORT flag.
+ * Other data items can in theory be from 0 to \ref MDBX_MAXDATASIZE bytes long.
+ *
+ * \note The notable difference between MDBX and LMDB is that MDBX supports
+ * zero-length keys. */
+#ifndef HAVE_STRUCT_IOVEC
+struct iovec {
+  void *iov_base; /**< pointer to some data */
+  size_t iov_len; /**< the length of data in bytes */
+};
+#define HAVE_STRUCT_IOVEC
+#endif /* HAVE_STRUCT_IOVEC */
+
+#if defined(__sun) || defined(__SVR4) || defined(__svr4__)
+/* The `iov_len` is signed on Sun/Solaris.
+ * So define custom MDBX_val to avoid a lot of warnings. */
+struct MDBX_val {
+  void *iov_base; /**< pointer to some data */
+  size_t iov_len; /**< the length of data in bytes */
+};
+#ifndef __cplusplus
+typedef struct MDBX_val MDBX_val;
+#endif
+#else  /* SunOS */
+typedef struct iovec MDBX_val;
+#endif /* ! SunOS */
+
+enum MDBX_constants {
+  /** The hard limit for DBI handles */
+  MDBX_MAX_DBI = UINT32_C(32765),
+
+  /** The maximum size of a data item. */
+  MDBX_MAXDATASIZE = UINT32_C(0x7fff0000),
+
+  /** The minimal database page size in bytes. */
+  MDBX_MIN_PAGESIZE = 256,
+
+  /** The maximal database page size in bytes. */
+  MDBX_MAX_PAGESIZE = 65536,
+};
+
+/* THE FILES *******************************************************************
+ * At the file system level, the environment corresponds to a pair of files.
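+ *
+ * E.g. (a sketch): without \ref MDBX_NOSUBDIR, opening "/tmp/db" works with
+ * "/tmp/db/mdbx.dat" and "/tmp/db/mdbx.lck"; with \ref MDBX_NOSUBDIR the
+ * pair is "/tmp/db" itself and "/tmp/db-lck".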
+ */
+
+/** \brief The name of the lock file in the environment */
+#define MDBX_LOCKNAME "/mdbx.lck"
+/** \brief The name of the data file in the environment */
+#define MDBX_DATANAME "/mdbx.dat"
+
+/** \brief The suffix of the lock file when \ref MDBX_NOSUBDIR is used */
+#define MDBX_LOCK_SUFFIX "-lck"
+
+/* DEBUG & LOGGING ************************************************************/
+
+/** \addtogroup c_debug
+ * \note Most debug features are enabled only when libmdbx is built with the
+ * \ref MDBX_DEBUG build option. @{ */
+
+/** Log level (requires building libmdbx with the \ref MDBX_DEBUG option) */
+enum MDBX_log_level_t {
+  /** Critical conditions, i.e. assertion failures */
+  MDBX_LOG_FATAL = 0,
+
+  /** Enables logging for error conditions and \ref MDBX_LOG_FATAL */
+  MDBX_LOG_ERROR = 1,
+
+  /** Enables logging for warning conditions and \ref MDBX_LOG_ERROR ...
+      \ref MDBX_LOG_FATAL */
+  MDBX_LOG_WARN = 2,
+
+  /** Enables logging for normal but significant condition and
+      \ref MDBX_LOG_WARN ... \ref MDBX_LOG_FATAL */
+  MDBX_LOG_NOTICE = 3,
+
+  /** Enables logging for verbose informational and \ref MDBX_LOG_NOTICE ...
+      \ref MDBX_LOG_FATAL */
+  MDBX_LOG_VERBOSE = 4,
+
+  /** Enables logging for debug-level messages and \ref MDBX_LOG_VERBOSE ...
+      \ref MDBX_LOG_FATAL */
+  MDBX_LOG_DEBUG = 5,
+
+  /** Enables logging for trace debug-level messages and \ref MDBX_LOG_DEBUG ...
+      \ref MDBX_LOG_FATAL */
+  MDBX_LOG_TRACE = 6,
+
+  /** Enables extra debug-level messages (dump pgno lists)
+      and all other log-messages */
+  MDBX_LOG_EXTRA = 7,
+
+  /** for \ref mdbx_setup_debug() only: Don't change current settings */
+  MDBX_LOG_DONTCHANGE = -1
+};
+#ifndef __cplusplus
+typedef enum MDBX_log_level_t MDBX_log_level_t;
+#endif
+
+/** \brief Runtime debug flags
+ *
+ * \details `MDBX_DBG_DUMP` and `MDBX_DBG_LEGACY_MULTIOPEN` always have an
+ * effect, but `MDBX_DBG_ASSERT`, `MDBX_DBG_AUDIT` and `MDBX_DBG_JITTER` only
+ * if libmdbx is built with \ref MDBX_DEBUG. */
+enum MDBX_debug_flags_t {
+  /** Enable assertion checks.
+   * Requires build with \ref MDBX_DEBUG > 0 */
+  MDBX_DBG_ASSERT = 1,
+
+  /** Enable pages usage audit at commit transactions.
+   * Requires build with \ref MDBX_DEBUG > 0 */
+  MDBX_DBG_AUDIT = 2,
+
+  /** Enable small random delays in critical points.
+   * Requires build with \ref MDBX_DEBUG > 0 */
+  MDBX_DBG_JITTER = 4,
+
+  /** Include (or not) meta-pages in coredump files.
+   * May affect performance in \ref MDBX_WRITEMAP mode */
+  MDBX_DBG_DUMP = 8,
+
+  /** Allow multi-opening environment(s) */
+  MDBX_DBG_LEGACY_MULTIOPEN = 16,
+
+  /** Allow read and write transactions overlapping for the same thread */
+  MDBX_DBG_LEGACY_OVERLAP = 32,
+
+  /** for mdbx_setup_debug() only: Don't change current settings */
+  MDBX_DBG_DONTCHANGE = -1
+};
+#ifndef __cplusplus
+typedef enum MDBX_debug_flags_t MDBX_debug_flags_t;
+#else
+DEFINE_ENUM_FLAG_OPERATORS(MDBX_debug_flags_t)
+#endif
+
+/** \brief A debug-logger callback function,
+ * called to output a log message.
+ * \see mdbx_setup_debug()
+ *
+ * \param [in] loglevel The severity of the message.
+ * \param [in] function The name of the function which emits the message.
+ * \param [in] line The source line number.
+ * \param [in] fmt The printf-like format string.
+ * \param [in] args The variadic arguments for the `fmt`. */
+typedef void MDBX_debug_func(MDBX_log_level_t loglevel, const char *function,
+                             int line, const char *fmt,
+                             va_list args) MDBX_CXX17_NOEXCEPT;
+
+/** \brief The "don't change `logger`" value for mdbx_setup_debug() */
+#define MDBX_LOGGER_DONTCHANGE ((MDBX_debug_func *)(intptr_t)-1)
+
+/** \brief Setup global log-level, debug options and debug logger.
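+ *
+ * E.g. a sketch (illustrative only) to query the current settings without
+ * changing anything:
+ * \code
+ * const int prev = mdbx_setup_debug(MDBX_LOG_DONTCHANGE, MDBX_DBG_DONTCHANGE,
+ *                                   MDBX_LOGGER_DONTCHANGE);
+ * \endcode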
+ * \returns The previous `debug_flags` in the 0-15 bits
+ * and `log_level` in the 16-31 bits. */
+LIBMDBX_API int mdbx_setup_debug(MDBX_log_level_t log_level,
+                                 MDBX_debug_flags_t debug_flags,
+                                 MDBX_debug_func *logger);
+
+/** \brief A callback function for most MDBX assert() failures,
+ * called before printing the message and aborting.
+ * \see mdbx_env_set_assert()
+ *
+ * \param [in] env An environment handle returned by mdbx_env_create().
+ * \param [in] msg The assertion message, not including newline. */
+typedef void MDBX_assert_func(const MDBX_env *env, const char *msg,
+                              const char *function,
+                              unsigned line) MDBX_CXX17_NOEXCEPT;
+
+/** \brief Set or reset the assert() callback of the environment.
+ *
+ * Does nothing if libmdbx was built with MDBX_DEBUG=0 or with NDEBUG,
+ * and will return `MDBX_ENOSYS` in such case.
+ *
+ * \param [in] env   An environment handle returned by mdbx_env_create().
+ * \param [in] func  An MDBX_assert_func function, or 0.
+ *
+ * \returns A non-zero error value on failure and 0 on success. */
+LIBMDBX_API int mdbx_env_set_assert(MDBX_env *env, MDBX_assert_func *func);
+
+/** \brief Dump given MDBX_val to the buffer
+ *
+ * Dumps it as a string if the value is printable (all bytes in the range
+ * 0x20..0x7E), otherwise makes a hexadecimal dump. Requires a buffer of at
+ * least 4 bytes in length.
+ *
+ * \returns One of:
+ *  - NULL if the given buffer size is less than 4 bytes;
+ *  - a pointer to a constant string if the given value is NULL or empty;
+ *  - otherwise a pointer to the given buffer. */
+LIBMDBX_API const char *mdbx_dump_val(const MDBX_val *key, char *const buf,
+                                      const size_t bufsize);
+
+/** \brief Panics with message and causes abnormal process termination. */
+LIBMDBX_API void mdbx_panic(const char *fmt, ...) MDBX_PRINTF_ARGS(1, 2);
+
+/** @} end of logging & debug */
+
+/** \brief Environment flags
+ * \ingroup c_opening
+ * \anchor env_flags
+ * \see mdbx_env_open() \see mdbx_env_set_flags() */
+enum MDBX_env_flags_t {
+  MDBX_ENV_DEFAULTS = 0,
+
+  /** No environment directory.
+   *
+   * By default, MDBX creates its environment in a directory whose pathname is
+   * given in path, and creates its data and lock files under that directory.
+   * With this option, path is used as-is for the database main data file.
+   * The database lock file is the path with "-lck" appended.
+   *
+   * - with `MDBX_NOSUBDIR` = in a filesystem we have the pair of MDBX-files
+   *   whose names are derived from the given pathname by appending predefined
+   *   suffixes.
+   *
+   * - without `MDBX_NOSUBDIR` = in a filesystem we have the MDBX-directory
+   *   with the given pathname, within which there is a pair of MDBX-files
+   *   with predefined names.
+   *
+   * This flag affects only the creation of a new environment by
+   * \ref mdbx_env_open(); otherwise, when opening an existing environment,
+   * libmdbx will choose this automatically. */
+  MDBX_NOSUBDIR = UINT32_C(0x4000),
+
+  /** Read only mode.
+   *
+   * Open the environment in read-only mode. No write operations will be
+   * allowed. MDBX will still modify the lock file - except on read-only
+   * filesystems, where MDBX does not use locks.
+   *
+   * - with `MDBX_RDONLY` = open environment in read-only mode.
+   *   MDBX supports pure read-only mode (i.e. without opening the LCK-file)
+   *   only when the environment directory and/or both files are not writable
+   *   (and the LCK-file may be missing). Such a case allows the file(s) to
+   *   be placed on a read-only network share.
+   *
+   * - without `MDBX_RDONLY` = open environment in read-write mode.
+   *
+   * This flag affects only the environment opening and can't be changed
+   * afterwards.
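+   *
+   * A sketch (illustrative only; `env` is assumed to be created beforehand
+   * by \ref mdbx_env_create()):
+   * \code
+   * int rc = mdbx_env_open(env, "/path/to/db", MDBX_RDONLY, 0);
+   * \endcode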
+   */
+  MDBX_RDONLY = UINT32_C(0x20000),
+
+  /** Open environment in exclusive/monopolistic mode.
+   *
+   * `MDBX_EXCLUSIVE` flag can be used as a replacement for `MDB_NOLOCK`,
+   * which is not supported by MDBX.
+   * In this way, you can get the minimal overhead, but with the correct
+   * multi-process and multi-thread locking.
+   *
+   * - with `MDBX_EXCLUSIVE` = open environment in exclusive/monopolistic mode
+   *   or return \ref MDBX_BUSY if the environment is already used by another
+   *   process. The main feature of the exclusive mode is the ability to open
+   *   the environment placed on a network share.
+   *
+   * - without `MDBX_EXCLUSIVE` = open environment in cooperative mode,
+   *   i.e. for multi-process access/interaction/cooperation.
+   *   The main requirements of the cooperative mode are:
+   *
+   *   1. data files MUST be placed in the LOCAL file system,
+   *      but NOT on a network share.
+   *   2. environment MUST be opened only by LOCAL processes,
+   *      but NOT over a network.
+   *   3. OS kernel (i.e. file system and memory mapping implementation) and
+   *      all processes that open the given environment MUST be running
+   *      in the physically single RAM with cache-coherency. The only
+   *      exception for the cache-consistency requirement is Linux on the MIPS
+   *      architecture, but this case has not been tested for a long time.
+   *
+   * This flag affects only the environment opening and can't be changed
+   * afterwards.
+   */
+  MDBX_EXCLUSIVE = UINT32_C(0x400000),
+
+  /** Using database/environment which already opened by another process(es).
+   *
+   * The `MDBX_ACCEDE` flag is useful for avoiding the \ref MDBX_INCOMPATIBLE
+   * error while opening the database/environment which is already used by
+   * another process(es) with unknown mode/flags. In such cases, if there is a
+   * difference in the specified flags (\ref MDBX_NOMETASYNC,
+   * \ref MDBX_SAFE_NOSYNC, \ref MDBX_UTTERLY_NOSYNC, \ref MDBX_LIFORECLAIM,
+   * \ref MDBX_COALESCE and \ref MDBX_NORDAHEAD), instead of returning an error,
+   * the database will be opened in compatibility with the already used mode.
+   *
+   * `MDBX_ACCEDE` has no effect if the current process is the only one either
+   * opening the DB in read-only mode, or the other process(es) use the DB in
+   * read-only mode. */
+  MDBX_ACCEDE = UINT32_C(0x40000000),
+
+  /** Map data into memory with write permission.
+   *
+   * Use a writeable memory map unless \ref MDBX_RDONLY is set. This uses fewer
+   * mallocs and requires much less work for tracking database pages, but
+   * loses protection from application bugs like wild pointer writes and other
+   * bad updates into the database. This may be slightly faster for DBs that
+   * fit entirely in RAM, but is slower for DBs larger than RAM. It also adds
+   * the possibility for stray application writes through pointers to silently
+   * corrupt the database.
+   *
+   * - with `MDBX_WRITEMAP` = all data will be mapped into memory in the
+   *   read-write mode. This offers a significant performance benefit, since
+   *   the data will be modified directly in mapped memory and then flushed to
+   *   disk by a single system call, without any memory management or copying.
+   *
+   * - without `MDBX_WRITEMAP` = data will be mapped into memory in the
+   *   read-only mode. This requires keeping all modified database pages in
+   *   memory and then writing them to disk through file operations.
+   *
+   * \warning On the other hand, `MDBX_WRITEMAP` adds the possibility for stray
+   * application writes through pointers to silently corrupt the database.
+   *
+   * \note The `MDBX_WRITEMAP` mode is incompatible with nested transactions,
+   * since this is unreasonable. I.e. nested transactions require allocation
+   * of database pages and more work for tracking them, which neuters the
+   * performance boost caused by the `MDBX_WRITEMAP` mode.
+   *
+   * This flag affects only the environment opening and can't be changed
+   * afterwards.
+   */
+  MDBX_WRITEMAP = UINT32_C(0x80000),
+
+  /** Tie reader locktable slots to read-only transactions
+   * instead of to threads.
+   *
+   * Don't use Thread-Local Storage; instead, tie reader locktable slots to
+   * \ref MDBX_txn objects instead of to threads. So, \ref mdbx_txn_reset()
+   * keeps the slot reserved for the \ref MDBX_txn object. A thread may use
+   * parallel read-only transactions. And a read-only transaction may span
+   * threads if you synchronize its use.
+   *
+   * Applications that multiplex many user threads over individual OS threads
+   * need this option. Such an application must also serialize the write
+   * transactions in an OS thread, since MDBX's write locking is unaware of
+   * the user threads.
+   *
+   * \note Regardless of the `MDBX_NOTLS` flag, a write transaction should
+   * always be used entirely in one thread from start to finish. MDBX checks
+   * this in a reasonable manner and returns the \ref MDBX_THREAD_MISMATCH
+   * error on rules violation.
+   *
+   * This flag affects only the environment opening and can't be changed
+   * afterwards.
+   */
+  MDBX_NOTLS = UINT32_C(0x200000),
+
+  /** Don't do readahead.
+   *
+   * Turn off readahead. Most operating systems perform readahead on read
+   * requests by default. This option turns it off if the OS supports it.
+   * Turning it off may help random read performance when the DB is larger
+   * than RAM and system RAM is full.
+   *
+   * By default libmdbx dynamically enables/disables readahead depending on
+   * the actual database size and currently available memory. On the other
+   * hand, such automation has a limitation: it is performed only when the DB
+   * size changes, and can't track and react to changes in available free RAM,
+   * since that changes independently and asynchronously.
+   *
+   * \note The mdbx_is_readahead_reasonable() function makes it possible to
+   * quickly find out whether to use readahead or not, based on the size of
+   * the data and the amount of available memory.
+   *
+   * This flag affects only the environment opening and can't be changed
+   * afterwards.
+   */
+  MDBX_NORDAHEAD = UINT32_C(0x800000),
+
+  /** Don't initialize malloc'ed memory before writing to datafile.
+   *
+   * Don't initialize malloc'ed memory before writing to unused spaces in the
+   * data file. By default, memory for pages written to the data file is
+   * obtained using malloc. While these pages may be reused in subsequent
+   * transactions, freshly malloc'ed pages will be initialized to zeroes before
+   * use. This avoids persisting leftover data from other code (that used the
+   * heap and subsequently freed the memory) into the data file.
+   *
+   * Note that many other system libraries may allocate and free memory from
+   * the heap for arbitrary uses. E.g., stdio may use the heap for file I/O
+   * buffers. This initialization step has a modest performance cost so some
+   * applications may want to disable it using this flag. This option can be a
+   * problem for applications which handle sensitive data like passwords, and
+   * it makes memory checkers like Valgrind noisy. This flag is not needed
+   * with \ref MDBX_WRITEMAP, which writes directly to the mmap instead of
+   * using malloc for pages.
+   * The initialization is also skipped if \ref MDBX_RESERVE is used; the
+   * caller is expected to overwrite all of the memory that was reserved in
+   * that case.
+   *
+   * This flag may be changed at any time using `mdbx_env_set_flags()`. */
+  MDBX_NOMEMINIT = UINT32_C(0x1000000),
+
+  /** Aims to coalesce Garbage Collection items.
+   *
+   * With the `MDBX_COALESCE` flag, MDBX aims to coalesce items while recycling
+   * the Garbage Collection. Technically, when possible, short lists of pages
+   * will be combined into longer ones, but only to fit on one database page.
+   * As a result, there will be fewer items in the Garbage Collection and page
+   * lists will be longer, which slightly increases the likelihood of returning
+   * pages to the Unallocated space and reducing the database file.
+   *
+   * This flag may be changed at any time using mdbx_env_set_flags(). */
+  MDBX_COALESCE = UINT32_C(0x2000000),
+
+  /** LIFO policy for recycling Garbage Collection items.
+   *
+   * The `MDBX_LIFORECLAIM` flag turns on the LIFO policy for recycling Garbage
+   * Collection items, instead of FIFO by default. On systems with a disk
+   * write-back cache, this can significantly increase write performance, up
+   * to several times in a best case scenario.
+   *
+   * The LIFO recycling policy means that pages which became unused most
+   * recently (i.e. just now) will be taken for reuse. Therefore the loop of
+   * database pages circulation becomes as short as possible. In other words,
+   * the number of pages, that are overwritten in memory and on disk during a
+   * series of write transactions, will be as small as possible. This creates
+   * ideal conditions for the efficient operation of the disk write-back cache.
+   *
+   * \ref MDBX_LIFORECLAIM is compatible with all no-sync flags, but has NO
+   * noticeable impact in combination with \ref MDBX_SAFE_NOSYNC or
+   * \ref MDBX_UTTERLY_NOSYNC. Because MDBX will reuse pages only before the
+   * last "steady" MVCC-snapshot, i.e. the loop length of database pages
+   * circulation will be mostly defined by the frequency of calling
+   * \ref mdbx_env_sync() rather than by the LIFO/FIFO difference.
+   *
+   * This flag may be changed at any time using mdbx_env_set_flags(). */
+  MDBX_LIFORECLAIM = UINT32_C(0x4000000),
+
+  /** Debugging option, fill/perturb released pages. */
+  MDBX_PAGEPERTURB = UINT32_C(0x8000000),
+
+  /* SYNC MODES****************************************************************/
+  /** \defgroup sync_modes SYNC MODES
+   *
+   * \attention Using any combination of \ref MDBX_SAFE_NOSYNC, \ref
+   * MDBX_NOMETASYNC and especially \ref MDBX_UTTERLY_NOSYNC is always a deal
+   * that trades durability for write performance. You must know exactly what
+   * you are doing and what risks you are taking!
+   *
+   * \note for LMDB users: \ref MDBX_SAFE_NOSYNC is NOT similar to LMDB_NOSYNC,
+   * but \ref MDBX_UTTERLY_NOSYNC exactly matches LMDB_NOSYNC. See details
+   * below.
+   *
+   * THE SCENE:
+   * - The DAT-file contains several MVCC-snapshots of the B-tree at the same
+   *   time; each of those B-trees has its own root page.
+   * - Each of the meta pages at the beginning of the DAT file contains a
+   *   pointer to the root page of the B-tree which is the result of a
+   *   particular transaction, and the number of that transaction.
+   * - For data durability, MDBX must first write all MVCC-snapshot data
+   *   pages and ensure they are written to the disk, then update a meta page
+   *   with the new transaction number and a pointer to the corresponding new
+   *   root page, and flush any buffers yet again.
+   * - Thus during commit the I/O buffers should be flushed to the disk twice;
+   *   i.e. fdatasync(), FlushFileBuffers() or similar syscall should be
+   *   called twice for each commit. This is very expensive for performance,
+   *   but guarantees durability even on unexpected system failure or power
+   *   outage. Of course, provided that the operating system and the
+   *   underlying hardware (e.g. disk) work correctly.
+   *
+   * TRADE-OFF:
+   * By skipping some stages described above, you can significantly benefit in
+   * speed, while partially or completely losing the guarantee of data
+   * durability and/or consistency in the event of system or power failure.
+   * Moreover, if for any reason disk write order is not preserved, then at
+   * the moment of a system crash, a meta-page with a pointer to the new B-tree
+   * may be written to disk, while the B-tree itself is not yet. In that case,
+   * the database will be corrupted!
+   *
+   * \see MDBX_SYNC_DURABLE \see MDBX_NOMETASYNC \see MDBX_SAFE_NOSYNC
+   * \see MDBX_UTTERLY_NOSYNC
+   *
+   * @{ */
+
+  /** Default robust and durable sync mode.
+   *
+   * Metadata is written and flushed to disk after the data is written and
+   * flushed, which guarantees the integrity of the database in the event
+   * of a crash at any time.
+   *
+   * \attention Please do not use other modes until you have studied all the
+   * details and are sure. Otherwise, you may lose your users' data, as happens
+   * in [Miranda NG](https://www.miranda-ng.org/) messenger. */
+  MDBX_SYNC_DURABLE = 0,
+
+  /** Don't sync the meta-page after commit.
+   *
+   * Flush system buffers to disk only once per transaction commit, omit the
+   * metadata flush. Defer that until the system flushes files to disk,
+   * or next non-\ref MDBX_RDONLY commit or \ref mdbx_env_sync(). Depending on
+   * the platform and hardware, with \ref MDBX_NOMETASYNC you may get a doubling
+   * of write performance.
+   *
+   * This trade-off maintains database integrity, but a system crash may
+   * undo the last committed transaction. I.e. it preserves the ACI
+   * (atomicity, consistency, isolation) but not D (durability) database
+   * property.
+   *
+   * The `MDBX_NOMETASYNC` flag may be changed at any time using
+   * \ref mdbx_env_set_flags() or by passing to \ref mdbx_txn_begin() for a
+   * particular write transaction. \see sync_modes */
+  MDBX_NOMETASYNC = UINT32_C(0x40000),
+
+  /** Don't sync anything but keep previous steady commits.
+   *
+   * Like \ref MDBX_UTTERLY_NOSYNC, the `MDBX_SAFE_NOSYNC` flag similarly
+   * disables flushing system buffers to disk when committing a transaction.
+   * But there is a huge difference in how the MVCC snapshots corresponding
+   * to previous "steady" transactions are recycled (see below).
+   *
+   * With \ref MDBX_WRITEMAP the `MDBX_SAFE_NOSYNC` instructs MDBX to use
+   * asynchronous mmap-flushes to disk. Asynchronous mmap-flushes mean that
+   * all writes will actually be scheduled and performed by the operating
+   * system in its own manner, i.e. unordered. MDBX itself merely notifies the
+   * operating system that it would be nice to write data to disk, but no more.
+   *
+   * Depending on the platform and hardware, with `MDBX_SAFE_NOSYNC` you may
+   * get a multiple increase of write performance, even 10 times or more.
+   *
+   * In contrast to the \ref MDBX_UTTERLY_NOSYNC mode, with the
+   * `MDBX_SAFE_NOSYNC` flag MDBX keeps untouched the pages within the B-tree
+   * of the last "steady" transaction, which was synced to disk completely.
+   * This has big implications for both data durability and (unfortunately)
+   * performance:
+   * - a system crash can't corrupt the database, but you will lose the last
+   *   transactions, because MDBX will roll back to the last steady commit
+   *   since it is kept explicitly.
+   * - the last steady transaction has an effect similar to a "long-lived"
+   *   read transaction (see above in the \ref restrictions section) since
+   *   it prevents reuse of pages freed by newer write transactions, thus
+   *   any data changes will be placed in newly allocated pages.
+   * - to avoid rapid database growth, the system will sync data and issue
+   *   a steady commit-point to resume page reuse each time there is
+   *   insufficient space, and before increasing the size of the file on
+   *   disk.
+   *
+   * In other words, the `MDBX_SAFE_NOSYNC` flag insures you against
+   * whole-database corruption, at the cost of an increased database size
+   * and/or number of disk IOPs. So the `MDBX_SAFE_NOSYNC` flag could be
+   * used with \ref mdbx_env_sync() as an alternative to batch committing or
+   * nested transactions (in some cases). As well, the auto-sync feature
+   * exposed by the \ref mdbx_env_set_syncbytes() and
+   * \ref mdbx_env_set_syncperiod() functions could be very useful with the
+   * `MDBX_SAFE_NOSYNC` flag.
+   *
+   * The number and volume of disk IOPs with the MDBX_SAFE_NOSYNC flag will
+   * be exactly the same as without any no-sync flags. However, you should
+   * expect a larger process [work set](https://bit.ly/2kA2tFX) and a
+   * significantly worse [locality of reference](https://bit.ly/2mbYq2J),
+   * due to the more intensive allocation of previously unused pages and the
+   * increased size of the database.
+   *
+   * The `MDBX_SAFE_NOSYNC` flag may be changed at any time using
+   * \ref mdbx_env_set_flags() or by passing it to \ref mdbx_txn_begin() for
+   * a particular write transaction. */
+  MDBX_SAFE_NOSYNC = UINT32_C(0x10000),
+
+  /** \deprecated Please use \ref MDBX_SAFE_NOSYNC instead of `MDBX_MAPASYNC`.
+   *
+   * Since version 0.9.x the `MDBX_MAPASYNC` is deprecated and has the same
+   * effect as \ref MDBX_SAFE_NOSYNC with \ref MDBX_WRITEMAP. This API
+   * simplification is just for convenience and clarity. */
+  MDBX_MAPASYNC = MDBX_SAFE_NOSYNC,
+
+  /** Don't sync anything and wipe previous steady commits.
+   *
+   * Don't flush system buffers to disk when committing a transaction. This
+   * optimization means a system crash can corrupt the database, if buffers
+   * are not yet flushed to disk. Depending on the platform and hardware,
+   * with `MDBX_UTTERLY_NOSYNC` you may get a multifold increase of write
+   * performance, even 100 times or more.
+   *
+   * If the filesystem preserves write order (which is rare and never
+   * provided unless explicitly noted) and the \ref MDBX_WRITEMAP and \ref
+   * MDBX_LIFORECLAIM flags are not used, then a system crash can't corrupt
+   * the database, but you can lose the last transactions, if at least one
+   * buffer is not yet flushed to disk. The risk is governed by how often
+   * the system flushes dirty buffers to disk and how often
+   * \ref mdbx_env_sync() is called. So, transactions exhibit ACI
+   * (atomicity, consistency, isolation) properties and only lose `D`
+   * (durability). I.e. database integrity is maintained, but a system crash
+   * may undo the final transactions.
+   *
+   * Otherwise, if the filesystem does not preserve write order (which is
+   * typical) or the \ref MDBX_WRITEMAP or \ref MDBX_LIFORECLAIM flags are
+   * used, you should expect a corrupted database after a system crash.
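+   *
+   * A hedged sketch of the typical usage pattern (an illustration, not a
+   * prescription; the `env` handle and the preceding commits are assumed to
+   * exist in the caller's code):
+   * \code
+   * // ... one or more mdbx_txn_commit() with MDBX_UTTERLY_NOSYNC set ...
+   * int rc = mdbx_env_sync_ex(env, true /* force */, false /* nonblock */);
+   * // on success the last commit becomes a steady (i.e. durable) point
+   * \endcode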
+   *
+   * So, the most important things about `MDBX_UTTERLY_NOSYNC`:
+   * - a system crash immediately after committing a write transaction will
+   *   highly likely lead to database corruption.
+   * - successful completion of mdbx_env_sync(force = true) after one or
+   *   more committed transactions guarantees consistency and durability.
+   * - BUT by committing two or more transactions you put the database back
+   *   into a weak state, in which a system crash may lead to database
+   *   corruption! In case of a single transaction after mdbx_env_sync(),
+   *   you may lose the transaction itself, but not the whole database.
+   *
+   * Nevertheless, `MDBX_UTTERLY_NOSYNC` provides a "weak" durability in
+   * case of an application crash (but no durability on system failure), and
+   * therefore may be very useful in scenarios where data durability is
+   * not required over a system failure (e.g. for short-lived data), or if
+   * you can take such a risk.
+   *
+   * The `MDBX_UTTERLY_NOSYNC` flag may be changed at any time using
+   * \ref mdbx_env_set_flags(), but has no effect if passed to
+   * \ref mdbx_txn_begin() for a particular write transaction.
+   * \see sync_modes */
+  MDBX_UTTERLY_NOSYNC = MDBX_SAFE_NOSYNC | UINT32_C(0x100000),
+
+  /** @} end of SYNC MODES */
+};
+#ifndef __cplusplus
+/** \ingroup c_opening */
+typedef enum MDBX_env_flags_t MDBX_env_flags_t;
+#else
+DEFINE_ENUM_FLAG_OPERATORS(MDBX_env_flags_t)
+#endif
+
+/** Transaction flags
+ * \ingroup c_transactions
+ * \anchor txn_flags
+ * \see mdbx_txn_begin() \see mdbx_txn_flags() */
+enum MDBX_txn_flags_t {
+  /** Start a read-write transaction.
+   *
+   * Only one write transaction may be active at a time. Writes are fully
+   * serialized, which guarantees that writers can never deadlock. */
+  MDBX_TXN_READWRITE = 0,
+
+  /** Start a read-only transaction.
+   *
+   * There can be multiple read-only transactions simultaneously that do not
+   * block each other or a write transaction. */
+  MDBX_TXN_RDONLY = MDBX_RDONLY,
+
+/** Prepare but do not start a read-only transaction.
+ *
+ * The transaction will not be started immediately, but the created
+ * transaction handle will be ready for use with \ref mdbx_txn_renew(). This
+ * flag allows preallocating memory and assigning a reader slot, thus
+ * avoiding these operations at the next start of the transaction. */
+#if CONSTEXPR_ENUM_FLAGS_OPERATIONS || defined(DOXYGEN)
+  MDBX_TXN_RDONLY_PREPARE = MDBX_RDONLY | MDBX_NOMEMINIT,
+#else
+  MDBX_TXN_RDONLY_PREPARE = uint32_t(MDBX_RDONLY) | uint32_t(MDBX_NOMEMINIT),
+#endif
+
+  /** Do not block when starting a write transaction. */
+  MDBX_TXN_TRY = UINT32_C(0x10000000),
+
+  /** Exactly the same as \ref MDBX_NOMETASYNC,
+   * but for this transaction only. */
+  MDBX_TXN_NOMETASYNC = MDBX_NOMETASYNC,
+
+  /** Exactly the same as \ref MDBX_SAFE_NOSYNC,
+   * but for this transaction only. */
+  MDBX_TXN_NOSYNC = MDBX_SAFE_NOSYNC
+};
+#ifndef __cplusplus
+typedef enum MDBX_txn_flags_t MDBX_txn_flags_t;
+#else
+DEFINE_ENUM_FLAG_OPERATORS(MDBX_txn_flags_t)
+#endif
+
+/** \brief Database flags
+ * \ingroup c_dbi
+ * \anchor db_flags
+ * \see mdbx_dbi_open() */
+enum MDBX_db_flags_t {
+  MDBX_DB_DEFAULTS = 0,
+
+  /** Use reverse string keys */
+  MDBX_REVERSEKEY = UINT32_C(0x02),
+
+  /** Use sorted duplicates, i.e. allow multi-values */
+  MDBX_DUPSORT = UINT32_C(0x04),
+
+  /** Numeric keys in native byte order, either uint32_t or uint64_t. The
+   * keys must all be of the same size and must be aligned when passed as
+   * arguments.
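+   *
+   * For example, a sketch (illustrative only; the `txn` and `dbi` handles
+   * are assumed to be created by the caller, the latter opened with
+   * `MDBX_INTEGERKEY`):
+   * \code
+   * uint64_t id = 42;                       // fixed-width, aligned key
+   * MDBX_val key = {&id, sizeof(id)};
+   * MDBX_val data = {(void *)"payload", 7};
+   * int rc = mdbx_put(txn, dbi, &key, &data, MDBX_UPSERT);
+   * \endcode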
+   */
+  MDBX_INTEGERKEY = UINT32_C(0x08),
+
+  /** With \ref MDBX_DUPSORT; sorted dup items have fixed size. */
+  MDBX_DUPFIXED = UINT32_C(0x10),
+
+  /** With \ref MDBX_DUPSORT and with \ref MDBX_DUPFIXED; dups are fixed-size
+   * \ref MDBX_INTEGERKEY -style integers. The data values must all be of the
+   * same size and must be aligned when passed as arguments. */
+  MDBX_INTEGERDUP = UINT32_C(0x20),
+
+  /** With \ref MDBX_DUPSORT; use reverse string comparison. */
+  MDBX_REVERSEDUP = UINT32_C(0x40),
+
+  /** Create DB if not already existing. */
+  MDBX_CREATE = UINT32_C(0x40000),
+
+  /** Opens an existing sub-database created with unknown flags.
+   *
+   * The `MDBX_DB_ACCEDE` flag is intended to open an existing sub-database
+   * which was created with unknown flags (\ref MDBX_REVERSEKEY,
+   * \ref MDBX_DUPSORT, \ref MDBX_INTEGERKEY, \ref MDBX_DUPFIXED,
+   * \ref MDBX_INTEGERDUP and \ref MDBX_REVERSEDUP).
+   *
+   * In such cases, instead of returning the \ref MDBX_INCOMPATIBLE error,
+   * the sub-database will be opened with the flags which it was created
+   * with, and then an application could determine the actual flags via
+   * \ref mdbx_dbi_flags(). */
+  MDBX_DB_ACCEDE = MDBX_ACCEDE
+};
+#ifndef __cplusplus
+/** \ingroup c_dbi */
+typedef enum MDBX_db_flags_t MDBX_db_flags_t;
+#else
+DEFINE_ENUM_FLAG_OPERATORS(MDBX_db_flags_t)
+#endif
+
+/** \brief Data changing flags
+ * \ingroup c_crud
+ * \see \ref c_crud_hints "Quick reference for Insert/Update/Delete operations"
+ * \see mdbx_put() \see mdbx_cursor_put() \see mdbx_replace() */
+enum MDBX_put_flags_t {
+  /** Upsertion by default (without any other flags). */
+  MDBX_UPSERT = 0,
+
+  /** For insertion: don't write if the key already exists. */
+  MDBX_NOOVERWRITE = UINT32_C(0x10),
+
+  /** Has effect only for \ref MDBX_DUPSORT databases.
+   * For upsertion: don't write if the key-value pair already exists.
+   * For deletion: remove all values for the key. */
+  MDBX_NODUPDATA = UINT32_C(0x20),
+
+  /** For upsertion: overwrite the current key/data pair.
+   * MDBX allows this flag for \ref mdbx_put() for explicit overwrite/update
+   * without insertion.
+   * For deletion: remove only the single entry at the current cursor
+   * position. */
+  MDBX_CURRENT = UINT32_C(0x40),
+
+  /** Has effect only for \ref MDBX_DUPSORT databases.
+   * For deletion: remove all multi-values (aka duplicates) for the given
+   * key.
+   * For upsertion: replace all multi-values for the given key with a new
+   * one. */
+  MDBX_ALLDUPS = UINT32_C(0x80),
+
+  /** For upsertion: just reserve space for the data, don't copy it.
+   * Return a pointer to the reserved space. */
+  MDBX_RESERVE = UINT32_C(0x10000),
+
+  /** Data is being appended.
+   * Don't split full pages, continue on a new one instead. */
+  MDBX_APPEND = UINT32_C(0x20000),
+
+  /** Has effect only for \ref MDBX_DUPSORT databases.
+   * Duplicate data is being appended.
+   * Don't split full pages, continue on a new one instead. */
+  MDBX_APPENDDUP = UINT32_C(0x40000),
+
+  /** Only for \ref MDBX_DUPFIXED.
+   * Store multiple data items in one call. */
+  MDBX_MULTIPLE = UINT32_C(0x80000)
+};
+#ifndef __cplusplus
+/** \ingroup c_crud */
+typedef enum MDBX_put_flags_t MDBX_put_flags_t;
+#else
+DEFINE_ENUM_FLAG_OPERATORS(MDBX_put_flags_t)
+#endif
+
+/** \brief Environment copy flags
+ * \ingroup c_extra
+ * \see mdbx_env_copy() \see mdbx_env_copy2fd() */
+enum MDBX_copy_flags_t {
+  MDBX_CP_DEFAULTS = 0,
+
+  /** Copy with compactification: omit free space from the copy and renumber
+   * all pages sequentially. */
+  MDBX_CP_COMPACT = 1u,
+
+  /** Force to make a resizable copy, i.e. dynamic size instead of fixed. */
+  MDBX_CP_FORCE_DYNAMIC_SIZE = 2u
+};
+#ifndef __cplusplus
+/** \ingroup c_extra */
+typedef enum MDBX_copy_flags_t MDBX_copy_flags_t;
+#else
+DEFINE_ENUM_FLAG_OPERATORS(MDBX_copy_flags_t)
+#endif
+
+/** \brief Cursor operations
+ * \ingroup c_cursors
+ * This is the set of all operations for retrieving data using a cursor.
+ * \see mdbx_cursor_get() */
+enum MDBX_cursor_op {
+  /** Position at first key/data item */
+  MDBX_FIRST,
+
+  /** \ref MDBX_DUPSORT -only: Position at first data item of current key. */
+  MDBX_FIRST_DUP,
+
+  /** \ref MDBX_DUPSORT -only: Position at key/data pair. */
+  MDBX_GET_BOTH,
+
+  /** \ref MDBX_DUPSORT -only: Position at given key and at first data item
+   * greater than or equal to specified data. */
+  MDBX_GET_BOTH_RANGE,
+
+  /** Return key/data at current cursor position */
+  MDBX_GET_CURRENT,
+
+  /** \ref MDBX_DUPFIXED -only: Return up to a page of duplicate data items
+   * from current cursor position. Move cursor to prepare
+   * for \ref MDBX_NEXT_MULTIPLE. */
+  MDBX_GET_MULTIPLE,
+
+  /** Position at last key/data item */
+  MDBX_LAST,
+
+  /** \ref MDBX_DUPSORT -only: Position at last data item of current key. */
+  MDBX_LAST_DUP,
+
+  /** Position at next data item */
+  MDBX_NEXT,
+
+  /** \ref MDBX_DUPSORT -only: Position at next data item of current key. */
+  MDBX_NEXT_DUP,
+
+  /** \ref MDBX_DUPFIXED -only: Return up to a page of duplicate data items
+   * from next cursor position. Move cursor to prepare
+   * for `MDBX_NEXT_MULTIPLE`. */
+  MDBX_NEXT_MULTIPLE,
+
+  /** Position at first data item of next key */
+  MDBX_NEXT_NODUP,
+
+  /** Position at previous data item */
+  MDBX_PREV,
+
+  /** \ref MDBX_DUPSORT -only: Position at previous data item of current
+   * key. */
+  MDBX_PREV_DUP,
+
+  /** Position at last data item of previous key */
+  MDBX_PREV_NODUP,
+
+  /** Position at specified key */
+  MDBX_SET,
+
+  /** Position at specified key, return both key and data */
+  MDBX_SET_KEY,
+
+  /** Position at first key greater than or equal to specified key. */
+  MDBX_SET_RANGE,
+
+  /** \ref MDBX_DUPFIXED -only: Position at previous page and return up to
+   * a page of duplicate data items. */
+  MDBX_PREV_MULTIPLE,
+
+  /** Position at the first key-value pair greater than or equal to the
+   * specified one; return both key and data, with the return code depending
+   * on whether an exact match was found.
+   *
+   * For non-DUPSORT-ed collections this works the same as \ref
+   * MDBX_SET_RANGE, but returns \ref MDBX_SUCCESS if the key was found
+   * exactly and \ref MDBX_RESULT_TRUE if a greater key was found.
+   *
+   * For DUPSORT-ed collections the data value is also taken into account,
+   * i.e. the lookup works on pairs/tuples of a key and each data value
+   * among the duplicates. Returns \ref MDBX_SUCCESS if an exactly matching
+   * key-value pair was found and \ref MDBX_RESULT_TRUE if the next pair was
+   * returned.
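+   *
+   * A minimal usage sketch (an illustration, not part of the API contract;
+   * the `cursor` is assumed to be opened by the caller via
+   * \ref mdbx_cursor_open()):
+   * \code
+   * MDBX_val key = {(void *)"foo", 3}, data = {NULL, 0};
+   * int rc = mdbx_cursor_get(cursor, &key, &data, MDBX_SET_LOWERBOUND);
+   * if (rc == MDBX_SUCCESS) {
+   *   // positioned at an exactly matching pair
+   * } else if (rc == MDBX_RESULT_TRUE) {
+   *   // positioned at the next (greater) pair
+   * } else if (rc == MDBX_NOTFOUND) {
+   *   // no pair at or after the given key
+   * }
+   * \endcode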
+   */
+  MDBX_SET_LOWERBOUND
+};
+#ifndef __cplusplus
+/** \ingroup c_cursors */
+typedef enum MDBX_cursor_op MDBX_cursor_op;
+#endif
+
+/** \brief Errors and return codes
+ * \ingroup c_err
+ *
+ * BerkeleyDB uses -30800 to -30999, we'll go under them.
+ * \see mdbx_strerror() \see mdbx_strerror_r() \see mdbx_liberr2str() */
+enum MDBX_error_t {
+  /** Successful result */
+  MDBX_SUCCESS = 0,
+
+  /** Alias for \ref MDBX_SUCCESS */
+  MDBX_RESULT_FALSE = MDBX_SUCCESS,
+
+  /** Successful result with special meaning or a flag */
+  MDBX_RESULT_TRUE = -1,
+
+  /** key/data pair already exists */
+  MDBX_KEYEXIST = -30799,
+
+  /** The first LMDB-compatible defined error code */
+  MDBX_FIRST_LMDB_ERRCODE = MDBX_KEYEXIST,
+
+  /** key/data pair not found (EOF) */
+  MDBX_NOTFOUND = -30798,
+
+  /** Requested page not found - this usually indicates corruption */
+  MDBX_PAGE_NOTFOUND = -30797,
+
+  /** Database is corrupted (page was of the wrong type and so on) */
+  MDBX_CORRUPTED = -30796,
+
+  /** Environment had a fatal error,
+   * i.e. an update of the meta page failed and so on. */
+  MDBX_PANIC = -30795,
+
+  /** DB file version mismatch with libmdbx */
+  MDBX_VERSION_MISMATCH = -30794,
+
+  /** File is not a valid MDBX file */
+  MDBX_INVALID = -30793,
+
+  /** Environment mapsize reached */
+  MDBX_MAP_FULL = -30792,
+
+  /** Environment maxdbs reached */
+  MDBX_DBS_FULL = -30791,
+
+  /** Environment maxreaders reached */
+  MDBX_READERS_FULL = -30790,
+
+  /** Transaction has too many dirty pages, i.e. the transaction is too big */
+  MDBX_TXN_FULL = -30788,
+
+  /** Cursor stack too deep - this usually indicates corruption,
+   * i.e. a branch-pages loop */
+  MDBX_CURSOR_FULL = -30787,
+
+  /** Page has not enough space - internal error */
+  MDBX_PAGE_FULL = -30786,
+
+  /** Database engine was unable to extend the mapping, e.g. since the
+   * address space is unavailable or busy. This can mean:
+   * - Database size was extended by another process beyond the environment
+   *   mapsize, and the engine was unable to extend the mapping while
+   *   starting a read transaction. The environment should be reopened to
+   *   continue.
+   * - The engine was unable to extend the mapping during a write
+   *   transaction or an explicit call of \ref mdbx_env_set_geometry(). */
+  MDBX_UNABLE_EXTEND_MAPSIZE = -30785,
+
+  /** Environment or database is not compatible with the requested operation
+   * or the specified flags. This can mean:
+   * - The operation expects an \ref MDBX_DUPSORT / \ref MDBX_DUPFIXED
+   *   database.
+   * - Opening a named DB when the unnamed DB has \ref MDBX_DUPSORT /
+   *   \ref MDBX_INTEGERKEY.
+   * - Accessing a data record as a database, or vice versa.
+   * - The database was dropped and recreated with different flags. */
+  MDBX_INCOMPATIBLE = -30784,
+
+  /** Invalid reuse of a reader locktable slot,
+   * e.g. a read transaction is already running for the current thread */
+  MDBX_BAD_RSLOT = -30783,
+
+  /** Transaction is not valid for the requested operation,
+   * e.g. it has errored and must be aborted, has a child, or is invalid */
+  MDBX_BAD_TXN = -30782,
+
+  /** Invalid size or alignment of key or data for the target database,
+   * or an invalid subDB name */
+  MDBX_BAD_VALSIZE = -30781,
+
+  /** The specified DBI-handle is invalid
+   * or changed by another thread/transaction */
+  MDBX_BAD_DBI = -30780,
+
+  /** Unexpected internal error, transaction should be aborted */
+  MDBX_PROBLEM = -30779,
+
+  /** The last LMDB-compatible defined error code */
+  MDBX_LAST_LMDB_ERRCODE = MDBX_PROBLEM,
+
+  /** Another write transaction is running, or the environment is already
+   * used, while opening with the \ref MDBX_EXCLUSIVE flag */
+  MDBX_BUSY = -30778,
+
+  /** The first of MDBX-added error codes */
+  MDBX_FIRST_ADDED_ERRCODE = MDBX_BUSY,
+
+  /** The specified key has more than one associated value */
+  MDBX_EMULTIVAL = -30421,
+
+  /** Bad signature of a runtime object(s), this can mean:
+   * - memory corruption or double-free;
+   * - ABI version mismatch (a rare case); */
+  MDBX_EBADSIGN = -30420,
+
+  /** Database should be recovered, but this can NOT be done for now
+   * since it was opened in read-only mode */
+  MDBX_WANNA_RECOVERY = -30419,
+
+  /** The given key value is mismatched to the current cursor position */
+  MDBX_EKEYMISMATCH = -30418,
+
+  /** Database is too large for the current system,
+   * e.g. it can NOT be mapped into RAM. */
+  MDBX_TOO_LARGE = -30417,
+
+  /** A thread has attempted to use an object not owned by it,
+   * e.g. a transaction that was started by another thread. */
+  MDBX_THREAD_MISMATCH = -30416,
+
+  /** Overlapping read and write transactions for the current thread */
+  MDBX_TXN_OVERLAPPING = -30415,
+
+  /* The last of MDBX-added error codes */
+  MDBX_LAST_ADDED_ERRCODE = MDBX_TXN_OVERLAPPING,
+
+#if defined(_WIN32) || defined(_WIN64)
+  MDBX_ENODATA = ERROR_HANDLE_EOF,
+  MDBX_EINVAL = ERROR_INVALID_PARAMETER,
+  MDBX_EACCESS = ERROR_ACCESS_DENIED,
+  MDBX_ENOMEM = ERROR_OUTOFMEMORY,
+  MDBX_EROFS = ERROR_FILE_READ_ONLY,
+  MDBX_ENOSYS = ERROR_NOT_SUPPORTED,
+  MDBX_EIO = ERROR_WRITE_FAULT,
+  MDBX_EPERM = ERROR_INVALID_FUNCTION,
+  MDBX_EINTR = ERROR_CANCELLED,
+  MDBX_ENOFILE = ERROR_FILE_NOT_FOUND,
+  MDBX_EREMOTE = ERROR_REMOTE_STORAGE_MEDIA_ERROR
+#else /* Windows */
+#ifdef ENODATA
+  MDBX_ENODATA = ENODATA,
+#else
+  MDBX_ENODATA = -1,
+#endif /* ENODATA */
+  MDBX_EINVAL = EINVAL,
+  MDBX_EACCESS = EACCES,
+  MDBX_ENOMEM = ENOMEM,
+  MDBX_EROFS = EROFS,
+  MDBX_ENOSYS = ENOSYS,
+  MDBX_EIO = EIO,
+  MDBX_EPERM = EPERM,
+  MDBX_EINTR = EINTR,
+  MDBX_ENOFILE = ENOENT,
+  MDBX_EREMOTE = ENOTBLK
+#endif /* !Windows */
+};
+#ifndef __cplusplus
+/** \ingroup c_err */
+typedef enum MDBX_error_t MDBX_error_t;
+#endif
+
+/** MDBX_MAP_RESIZED
+ * \ingroup c_err
+ * \deprecated Please review your code to use MDBX_UNABLE_EXTEND_MAPSIZE
+ * instead. */
+MDBX_DEPRECATED static __inline int MDBX_MAP_RESIZED_is_deprecated() {
+  return MDBX_UNABLE_EXTEND_MAPSIZE;
+}
+#define MDBX_MAP_RESIZED MDBX_MAP_RESIZED_is_deprecated()
+
+/** \brief Return a string describing a given error code.
+ * \ingroup c_err
+ *
+ * This function is a superset of the ANSI C X3.159-1989 (ANSI C) `strerror()`
+ * function. If the error code is greater than or equal to 0, then the string
+ * returned by the system function `strerror()` is returned. If the error code
+ * is less than 0, an error string corresponding to the MDBX library error is
+ * returned. See errors for a list of MDBX-specific error codes.
+ *
+ * `mdbx_strerror()` is NOT thread-safe because it may share a common
+ * internal buffer for system messages.
+ * The returned string must NOT be modified by the application, but MAY be
+ * modified by a subsequent call to \ref mdbx_strerror(), `strerror()` and
+ * other related functions.
+ * \see mdbx_strerror_r()
+ *
+ * \param [in] errnum The error code.
+ *
+ * \returns "error message" The description of the error. */
+LIBMDBX_API const char *mdbx_strerror(int errnum);
+
+/** \brief Return a string describing a given error code.
+ * \ingroup c_err
+ *
+ * This function is a superset of the ANSI C X3.159-1989 (ANSI C) `strerror()`
+ * function. If the error code is greater than or equal to 0, then the string
+ * returned by the system function `strerror()` is returned. If the error code
+ * is less than 0, an error string corresponding to the MDBX library error is
+ * returned. See errors for a list of MDBX-specific error codes.
+ *
+ * `mdbx_strerror_r()` is thread-safe since it uses a user-supplied buffer
+ * where appropriate. The returned string must NOT be modified by the
+ * application, since it may be a pointer to an internal constant string.
+ * However, there is no restriction if the returned string points to the
+ * supplied buffer.
+ * \see mdbx_strerror()
+ *
+ * mdbx_liberr2str() returns a string describing only MDBX error numbers, and
+ * NULL for non-MDBX error codes. This function is thread-safe since it
+ * returns a pointer to constant non-localized strings.
+ *
+ * \param [in] errnum The error code.
+ * \param [in,out] buf Buffer to store the error message.
+ * \param [in] buflen The size of buffer to store the message.
+ *
+ * \returns "error message" The description of the error. */
+LIBMDBX_API const char *mdbx_strerror_r(int errnum, char *buf, size_t buflen);
+MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API const char *mdbx_liberr2str(int errnum);
+
+#if defined(_WIN32) || defined(_WIN64) || defined(DOXYGEN)
+/** A bit of Windows' madness. Similar to \ref mdbx_strerror(), but returns
+ * Windows error messages in the OEM encoding for console utilities.
+ * \ingroup c_err
+ * \see mdbx_strerror_r_ANSI2OEM() */
+LIBMDBX_API const char *mdbx_strerror_ANSI2OEM(int errnum);
+
+/** A bit of Windows' madness. Similar to \ref mdbx_strerror_r(), but returns
+ * Windows error messages in the OEM encoding for console utilities.
+ * \ingroup c_err
+ * \see mdbx_strerror_ANSI2OEM() */
+LIBMDBX_API const char *mdbx_strerror_r_ANSI2OEM(int errnum, char *buf,
+                                                 size_t buflen);
+#endif /* Bit of Windows' madness */
+
+/** \brief Create an MDBX environment instance.
+ * \ingroup c_opening
+ *
+ * This function allocates memory for an \ref MDBX_env structure. To release
+ * the allocated memory and discard the handle, call \ref mdbx_env_close().
+ * Before the handle may be used, it must be opened using \ref mdbx_env_open().
+ *
+ * Various other options may also need to be set before opening the handle,
+ * e.g. \ref mdbx_env_set_geometry(), \ref mdbx_env_set_maxreaders(),
+ * \ref mdbx_env_set_maxdbs(), depending on usage requirements.
+ *
+ * \param [out] penv The address where the new handle will be stored.
+ *
+ * \returns a non-zero error value on failure and 0 on success. */
+LIBMDBX_API int mdbx_env_create(MDBX_env **penv);
+
+/** \brief MDBX environment options. */
+enum MDBX_option_t {
+  /** \brief Controls the maximum number of named databases for the
+   * environment.
+   *
+   * \details By default only the unnamed key-value database can be used,
+   * and an appropriate value should be set via `MDBX_opt_max_db` to use any
+   * named subDB(s). To reduce overhead, use the minimum sufficient value.
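+   *
+   * For example, a sketch (the value is illustrative; see
+   * \ref mdbx_env_set_option() declared below):
+   * \code
+   * mdbx_env_set_option(env, MDBX_opt_max_db, 12); // up to 12 named subDBs
+   * \endcode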
+   * This option may only be set after \ref mdbx_env_create() and before
+   * \ref mdbx_env_open().
+   *
+   * \see mdbx_env_set_maxdbs() \see mdbx_env_get_maxdbs() */
+  MDBX_opt_max_db,
+
+  /** \brief Defines the maximum number of threads/reader slots
+   * for all processes interacting with the database.
+   *
+   * \details This defines the number of slots in the lock table that is
+   * used to track readers in the environment. The default is about 100 for
+   * a 4K system page size. Starting a read-only transaction normally ties a
+   * lock table slot to the current thread until the environment closes or
+   * the thread exits. If \ref MDBX_NOTLS is in use, \ref mdbx_txn_begin()
+   * instead ties the slot to the \ref MDBX_txn object until it or the
+   * \ref MDBX_env object is destroyed. This option may only be set after
+   * \ref mdbx_env_create() and before \ref mdbx_env_open(), and has an
+   * effect only when the database is opened by the first process
+   * interacting with the database.
+   *
+   * \see mdbx_env_set_maxreaders() \see mdbx_env_get_maxreaders() */
+  MDBX_opt_max_readers,
+
+  /** \brief Controls the interprocess/shared threshold to force flush the
+   * data buffers to disk, if \ref MDBX_SAFE_NOSYNC is used.
+   *
+   * \see mdbx_env_set_syncbytes() \see mdbx_env_get_syncbytes() */
+  MDBX_opt_sync_bytes,
+
+  /** \brief Controls the interprocess/shared relative period since the last
+   * unsteady commit to force flush the data buffers to disk,
+   * if \ref MDBX_SAFE_NOSYNC is used.
+   * \see mdbx_env_set_syncperiod() \see mdbx_env_get_syncperiod() */
+  MDBX_opt_sync_period,
+
+  /** \brief Controls the in-process limit to grow a list of
+   * reclaimed/recycled page numbers for finding a sequence of contiguous
+   * pages for large data items.
+   *
+   * \details Long values require the allocation of contiguous database
+   * pages. To find such sequences, it may be necessary to accumulate very
+   * large lists, especially when placing very long values (more than a
+   * megabyte) in a large database (several tens of gigabytes), which is
+   * very expensive in extreme cases. This threshold allows you to avoid
+   * such costs by allocating new pages at the end of the database (with its
+   * possible growth on disk), instead of further accumulating/reclaiming
+   * Garbage Collection records.
+   *
+   * On the other hand, a too small threshold will lead to unreasonable
+   * database growth, and/or to the inability to put long values.
+   *
+   * The `MDBX_opt_rp_augment_limit` controls the described limit for the
+   * current process. Default is 262144, which is usually enough for most
+   * cases. */
+  MDBX_opt_rp_augment_limit,
+
+  /** \brief Controls the in-process limit to grow a cache of dirty
+   * pages for reuse in the current transaction.
+   *
+   * \details A 'dirty page' refers to a page that has been updated in
+   * memory only, i.e. the changes to a dirty page are not yet stored on
+   * disk. To reduce overhead, it is reasonable not to release all such
+   * pages immediately, but to leave some of them in cache for reuse in the
+   * current transaction.
+   *
+   * The `MDBX_opt_loose_limit` allows you to set a limit for such a cache
+   * inside the current process. Should be in the range 0..255, default is
+   * 64. */
+  MDBX_opt_loose_limit,
+
+  /** \brief Controls the in-process limit of pre-allocated memory items
+   * for dirty pages.
+   *
+   * \details A 'dirty page' refers to a page that has been updated in
+   * memory only, i.e. the changes to a dirty page are not yet stored on
+   * disk.
+   * Without \ref MDBX_WRITEMAP dirty pages are allocated from memory and
+   * released when a transaction is committed. To reduce overhead, it is
+   * reasonable not to release all of them, but to leave some allocations in
+   * reserve for reuse in the next transaction(s).
+   *
+   * The `MDBX_opt_dp_reserve_limit` allows you to set a limit for such a
+   * reserve inside the current process. Default is 1024. */
+  MDBX_opt_dp_reserve_limit,
+
+  /** \brief Controls the in-process limit of dirty pages
+   * for a write transaction.
+   *
+   * \details A 'dirty page' refers to a page that has been updated in
+   * memory only, i.e. the changes to a dirty page are not yet stored on
+   * disk. Without \ref MDBX_WRITEMAP dirty pages are allocated from memory
+   * and remain busy until they are written to disk. Therefore for large
+   * transactions it is reasonable to limit the collecting of dirty pages
+   * above some threshold and to spill them to disk instead.
+   *
+   * The `MDBX_opt_txn_dp_limit` controls the described threshold for the
+   * current process. Default is 65536, which is usually enough for most
+   * cases. */
+  MDBX_opt_txn_dp_limit,
+
+  /** \brief Controls the in-process initial allocation size for the dirty
+   * pages list of a write transaction. Default is 1024. */
+  MDBX_opt_txn_dp_initial,
+
+  /** \brief Controls, within the current process, what maximal part of the
+   * dirty pages may be spilled when necessary.
+   *
+   * \details The `MDBX_opt_spill_max_denominator` defines the denominator
+   * for the upper limit of the part of the current dirty pages that may be
+   * spilled when the free room for new dirty pages (i.e. the distance to
+   * the `MDBX_opt_txn_dp_limit` threshold) is not enough to perform the
+   * requested operation.
+   * Exactly `max_pages_to_spill = dirty_pages - dirty_pages / N`,
+   * where `N` is the value set by `MDBX_opt_spill_max_denominator`.
+   *
+   * Should be in the range 0..255, where zero means no limit, i.e. all
+   * dirty pages could be spilled. Default is 8, i.e. no more than 7/8 of
+   * the current dirty pages may be spilled when the condition described
+   * above is reached. */
+  MDBX_opt_spill_max_denominator,
+
+  /** \brief Controls, within the current process, what minimal part of the
+   * dirty pages should be spilled when necessary.
+   *
+   * \details The `MDBX_opt_spill_min_denominator` defines the denominator
+   * for the lower limit of the part of the current dirty pages that should
+   * be spilled when the free room for new dirty pages (i.e. the distance to
+   * the `MDBX_opt_txn_dp_limit` threshold) is not enough to perform the
+   * requested operation.
+   * Exactly `min_pages_to_spill = dirty_pages / N`,
+   * where `N` is the value set by `MDBX_opt_spill_min_denominator`.
+   *
+   * Should be in the range 0..255, where zero means no restriction at the
+   * bottom. Default is 8, i.e. at least 1/8 of the current dirty pages
+   * should be spilled when the condition described above is reached. */
+  MDBX_opt_spill_min_denominator,
+
+  /** \brief Controls how much of the parent transaction's dirty pages will
+   * be spilled within the current process when starting each child
+   * transaction.
+   *
+   * \details The `MDBX_opt_spill_parent4child_denominator` defines the
+   * denominator to determine how much of the parent transaction's dirty
+   * pages will be spilled explicitly when starting each child transaction.
+   * Exactly `pages_to_spill = dirty_pages / N`,
+   * where `N` is the value set by `MDBX_opt_spill_parent4child_denominator`.
+   *
+   * For a stack of nested transactions each dirty page can be spilled only
+   * once, and a parent's dirty pages can't be spilled while child
+   * transaction(s) are running. Therefore a child transaction could reach
+   * \ref MDBX_TXN_FULL when the parent transaction(s) has spilled too
+   * little (and the child reaches the limit of dirty pages), or when the
+   * parent(s) has spilled too much (since the child can't spill already
+   * spilled pages). So there is no universal golden ratio.
+   *
+   * Should be in the range 0..255, where zero means no explicit spilling
+   * will be performed during the start of nested transactions.
+   * Default is 0, i.e. by default no spilling is performed during the start
+   * of nested transactions, which corresponds to the historical
+   * behaviour. */
+  MDBX_opt_spill_parent4child_denominator,
+
+  /** \brief Controls the in-process threshold of semi-empty pages merging.
+   * \warning This is an experimental option and subject to change or
+   * removal.
+   * \details This option controls the in-process threshold of minimum page
+   * fill, as a percentage of used space on a page. Neighbour pages emptier
+   * than this value are candidates for merging. The threshold value is
+   * specified in 1/65536ths of a percent, which is equivalent to the
+   * 16-dot-16 fixed point format. The specified value must be in the range
+   * from 12.5% (almost empty) to 50% (half empty), which corresponds to the
+   * range from 8192 to 32768 in units, respectively. */
+  MDBX_opt_merge_threshold_16dot16_percent,
+};
+#ifndef __cplusplus
+/** \ingroup c_settings */
+typedef enum MDBX_option_t MDBX_option_t;
+#endif
+
+/** \brief Sets the value of a runtime option for an environment.
+ * \ingroup c_settings
+ *
+ * \param [in] env An environment handle returned by \ref mdbx_env_create().
+ * \param [in] option The option from \ref MDBX_option_t to set the value of.
+ * \param [in] value The value of the option to be set.
+ *
+ * \see MDBX_option_t
+ * \see mdbx_env_get_option()
+ * \returns A non-zero error value on failure and 0 on success. */
+LIBMDBX_API int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option,
+                                    const uint64_t value);
+
+/** \brief Gets the value of a runtime option from an environment.
+ * \ingroup c_settings
+ *
+ * \param [in] env An environment handle returned by \ref mdbx_env_create().
+ * \param [in] option The option from \ref MDBX_option_t to get the value of.
+ * \param [out] pvalue The address where the option's value will be stored.
+ *
+ * \see MDBX_option_t
+ * \see mdbx_env_set_option()
+ * \returns A non-zero error value on failure and 0 on success. */
+LIBMDBX_API int mdbx_env_get_option(const MDBX_env *env,
+                                    const MDBX_option_t option,
+                                    uint64_t *pvalue);
+
+/** \brief Open an environment instance.
+ * \ingroup c_opening
+ *
+ * Regardless of whether this function fails or not, \ref mdbx_env_close()
+ * must be called later to discard the \ref MDBX_env handle and release
+ * associated resources.
+ *
+ * \param [in] env An environment handle returned
+ *                 by \ref mdbx_env_create()
+ *
+ * \param [in] pathname The pathname for the database or the directory in
+ *                  which the database files reside. In the case of a
+ *                  directory it must already exist and be writable.
+ *
+ * \param [in] flags Special options for this environment. This parameter
+ *                   must be set to 0 or by bitwise OR'ing together one
+ *                   or more of the values described above in the
+ *                   \ref env_flags and \ref sync_modes sections.
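+ *
+ *                   For example, a sketch of a typical call (the path,
+ *                   flags and mode are illustrative only; error handling
+ *                   omitted):
+ *                   \code
+ *                   MDBX_env *env = NULL;
+ *                   int rc = mdbx_env_create(&env);
+ *                   if (rc == MDBX_SUCCESS)
+ *                     rc = mdbx_env_open(env, "./example-db",
+ *                                        MDBX_NOSUBDIR | MDBX_COALESCE,
+ *                                        0644);
+ *                   \endcode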
+ *
+ * Flags set by mdbx_env_set_flags() are also used:
+ *  - \ref MDBX_NOSUBDIR, \ref MDBX_RDONLY, \ref MDBX_EXCLUSIVE,
+ *    \ref MDBX_WRITEMAP, \ref MDBX_NOTLS, \ref MDBX_NORDAHEAD,
+ *    \ref MDBX_NOMEMINIT, \ref MDBX_COALESCE, \ref MDBX_LIFORECLAIM.
+ *    See the \ref env_flags section.
+ *
+ *  - \ref MDBX_NOMETASYNC, \ref MDBX_SAFE_NOSYNC, \ref MDBX_UTTERLY_NOSYNC.
+ *    See the \ref sync_modes section.
+ *
+ * \note The `MDB_NOLOCK` flag is not supported by MDBX;
+ * try \ref MDBX_EXCLUSIVE as a replacement.
+ *
+ * \note MDBX doesn't allow mixing processes with different
+ * \ref MDBX_SAFE_NOSYNC flags on the same environment.
+ * In such a case \ref MDBX_INCOMPATIBLE will be returned.
+ *
+ * If the database already exists and the parameters specified earlier by
+ * \ref mdbx_env_set_geometry() are incompatible (i.e., for instance, a
+ * different page size), then \ref mdbx_env_open() will return the
+ * \ref MDBX_INCOMPATIBLE error.
+ *
+ * \param [in] mode The UNIX permissions to set on created files.
+ *                  A zero value means to open existing, but do not create.
+ *
+ * \return A non-zero error value on failure and 0 on success,
+ *         some possible errors are:
+ * \retval MDBX_VERSION_MISMATCH The version of the MDBX library doesn't
+ *                               match the version that created the database
+ *                               environment.
+ * \retval MDBX_INVALID The environment file headers are corrupted.
+ * \retval MDBX_ENOENT The directory specified by the path parameter
+ *                     doesn't exist.
+ * \retval MDBX_EACCES The user didn't have permission to access
+ *                     the environment files.
+ * \retval MDBX_EAGAIN The environment was locked by another process.
+ * \retval MDBX_BUSY The \ref MDBX_EXCLUSIVE flag was specified and the
+ *                   environment is in use by another process,
+ *                   or the current process tries to open the environment
+ *                   more than once.
+ * \retval MDBX_INCOMPATIBLE The environment is already opened by another
+ *                           process, but with a different set of
+ *                           \ref MDBX_SAFE_NOSYNC,
+ *                           \ref MDBX_UTTERLY_NOSYNC flags.
+ *                           Or the database already exists and the
+ *                           parameters specified earlier by
+ *                           \ref mdbx_env_set_geometry() are incompatible
+ *                           (i.e. a different pagesize, etc).
+ *
+ * \retval MDBX_WANNA_RECOVERY The \ref MDBX_RDONLY flag was specified but
+ *                             read-write access is required to rollback
+ *                             the inconsistent state after a system crash.
+ *
+ * \retval MDBX_TOO_LARGE Database is too large for this process,
+ *                        i.e. a 32-bit process tries to open a >4Gb
+ *                        database.
+ */
+LIBMDBX_API int mdbx_env_open(MDBX_env *env, const char *pathname,
+                              MDBX_env_flags_t flags, mdbx_mode_t mode);
+
+/** \brief Deletion modes for \ref mdbx_env_delete().
+ * \ingroup c_extra
+ * \see mdbx_env_delete() */
+enum MDBX_env_delete_mode_t {
+  /** \brief Just delete the environment's files and directory if any.
+   * \note On POSIX systems, processes already working with the database
+   * will continue to work without interference until they close the
+   * environment.
+   * \note On Windows, the behavior of `MDBX_ENV_JUST_DELETE` is different
+   * because the system does not support deleting files that are currently
+   * memory-mapped. */
+  MDBX_ENV_JUST_DELETE = 0,
+  /** \brief Make sure that the environment is not being used by other
+   * processes, or return an error otherwise. */
+  MDBX_ENV_ENSURE_UNUSED = 1,
+  /** \brief Wait until other processes close the environment before
+   * deletion.
+   */
+  MDBX_ENV_WAIT_FOR_UNUSED = 2,
+};
+#ifndef __cplusplus
+/** \ingroup c_extra */
+typedef enum MDBX_env_delete_mode_t MDBX_env_delete_mode_t;
+#endif
+
+/** \brief Delete the environment's files in a proper and multiprocess-safe
+ * way.
+ * \ingroup c_extra
+ *
+ * \param [in] pathname The pathname for the database or the directory in
+ *                      which the database files reside.
+ *
+ * \param [in] mode Special deletion mode for the environment. This
+ *                  parameter must be set to one of the values described
+ *                  above in the \ref MDBX_env_delete_mode_t section.
+ *
+ * \note \ref MDBX_ENV_JUST_DELETE is not supported on Windows since the
+ * system is unable to delete memory-mapped files.
+ *
+ * \returns A non-zero error value on failure and 0 on success,
+ *          some possible errors are:
+ * \retval MDBX_RESULT_TRUE No corresponding files or directories were
+ *                          found, so no deletion was performed. */
+LIBMDBX_API int mdbx_env_delete(const char *pathname,
+                                MDBX_env_delete_mode_t mode);
+
+/** \brief Copy an MDBX environment to the specified path, with options.
+ * \ingroup c_extra
+ *
+ * This function may be used to make a backup of an existing environment.
+ * No lockfile is created, since it gets recreated as needed.
+ * \note This call can trigger significant file size growth if run in
+ * parallel with write transactions, because it employs a read-only
+ * transaction. See long-lived transactions under the \ref restrictions
+ * section.
+ *
+ * \param [in] env An environment handle returned by mdbx_env_create().
+ *                 It must have already been opened successfully.
+ * \param [in] dest The pathname of a file in which the copy will reside.
+ *                  This file must not already exist, but the parent
+ *                  directory must be writable.
+ * \param [in] flags Special options for this operation. This parameter
+ *                   must be set to 0 or by bitwise OR'ing together one or
+ *                   more of the values described here:
+ *
+ *  - \ref MDBX_CP_COMPACT
+ *      Perform compaction while copying: omit free pages and renumber all
+ *      pages in the output sequentially. This option consumes a little bit
+ *      more CPU for processing, but may run faster than the default, on
+ *      account of skipping free pages.
+ *
+ *  - \ref MDBX_CP_FORCE_DYNAMIC_SIZE
+ *      Force to make a resizable copy, i.e. dynamic size instead of fixed.
+ *
+ * \returns A non-zero error value on failure and 0 on success. */
+LIBMDBX_API int mdbx_env_copy(MDBX_env *env, const char *dest,
+                              MDBX_copy_flags_t flags);
+
+/** \brief Copy an environment to the specified file descriptor, with
+ * options. \ingroup c_extra
+ *
+ * This function may be used to make a backup of an existing environment.
+ * No lockfile is created, since it gets recreated as needed.
+ * \see mdbx_env_copy()
+ *
+ * \note This call can trigger significant file size growth if run in
+ * parallel with write transactions, because it employs a read-only
+ * transaction. See long-lived transactions under the \ref restrictions
+ * section.
+ *
+ * \note Fails if the environment has suffered a page leak and the
+ * destination file descriptor is associated with a pipe, socket, or FIFO.
+ *
+ * \param [in] env An environment handle returned by mdbx_env_create().
+ *                 It must have already been opened successfully.
+ * \param [in] fd The file descriptor to write the copy to. It must have
+ *                already been opened for write access.
+ * \param [in] flags Special options for this operation.
+ *                   \see mdbx_env_copy()
+ *
+ * \returns A non-zero error value on failure and 0 on success.
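+ *
+ * A usage sketch (a POSIX-only illustration where \ref mdbx_filehandle_t
+ * is a plain file descriptor; the path is hypothetical, includes and error
+ * handling are omitted):
+ * \code
+ * int fd = open("./backup.mdbx", O_WRONLY | O_CREAT | O_EXCL, 0640);
+ * int rc = mdbx_env_copy2fd(env, fd, MDBX_CP_COMPACT);
+ * close(fd);
+ * \endcode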
+ */
+LIBMDBX_API int mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd,
+                                 MDBX_copy_flags_t flags);
+
+/** \brief Statistics for a database in the environment
+ * \ingroup c_statinfo
+ * \see mdbx_env_stat_ex() \see mdbx_dbi_stat() */
+struct MDBX_stat {
+  uint32_t ms_psize; /**< Size of a database page. This is the same for all
+                        databases. */
+  uint32_t ms_depth; /**< Depth (height) of the B-tree */
+  uint64_t ms_branch_pages;   /**< Number of internal (non-leaf) pages */
+  uint64_t ms_leaf_pages;     /**< Number of leaf pages */
+  uint64_t ms_overflow_pages; /**< Number of overflow pages */
+  uint64_t ms_entries;        /**< Number of data items */
+  uint64_t ms_mod_txnid; /**< Transaction ID of the last committed
+                            modification */
+};
+#ifndef __cplusplus
+/** \ingroup c_statinfo */
+typedef struct MDBX_stat MDBX_stat;
+#endif
+
+/** \brief Return statistics about the MDBX environment.
+ * \ingroup c_statinfo
+ *
+ * At least one of the env or txn arguments must be non-null. If txn is
+ * passed non-null, then the stat will be filled according to the given
+ * transaction. Otherwise, if txn is null, then the stat will be populated
+ * with a snapshot from the last committed write transaction, so at the next
+ * call different information may be returned.
+ *
+ * The legacy mdbx_env_stat() corresponds to calling \ref mdbx_env_stat_ex()
+ * with a null `txn` argument.
+ *
+ * \param [in] env An environment handle returned by \ref mdbx_env_create()
+ * \param [in] txn A transaction handle returned by \ref mdbx_txn_begin()
+ * \param [out] stat The address of an \ref MDBX_stat structure where
+ *                   the statistics will be copied
+ * \param [in] bytes The size of \ref MDBX_stat.
+ *
+ * \returns A non-zero error value on failure and 0 on success. */
+LIBMDBX_API int mdbx_env_stat_ex(const MDBX_env *env, const MDBX_txn *txn,
+                                 MDBX_stat *stat, size_t bytes);
+
+/** \brief Return statistics about the MDBX environment.
+ * \ingroup c_statinfo
+ * \deprecated Please use mdbx_env_stat_ex() instead. */
+MDBX_DEPRECATED LIBMDBX_INLINE_API(int, mdbx_env_stat,
+                                   (const MDBX_env *env, MDBX_stat *stat,
+                                    size_t bytes)) {
+  return mdbx_env_stat_ex(env, NULL, stat, bytes);
+}
+
+/** \brief Information about the environment
+ * \ingroup c_statinfo
+ * \see mdbx_env_info_ex() */
+struct MDBX_envinfo {
+  struct {
+    uint64_t lower;   /**< Lower limit for datafile size */
+    uint64_t upper;   /**< Upper limit for datafile size */
+    uint64_t current; /**< Current datafile size */
+    uint64_t shrink;  /**< Shrink threshold for datafile */
+    uint64_t grow;    /**< Growth step for datafile */
+  } mi_geo;
+  uint64_t mi_mapsize;             /**< Size of the data memory map */
+  uint64_t mi_last_pgno;           /**< Number of the last used page */
+  uint64_t mi_recent_txnid;        /**< ID of the last committed transaction */
+  uint64_t mi_latter_reader_txnid; /**< ID of the last reader transaction */
+  uint64_t mi_self_latter_reader_txnid; /**< ID of the last reader transaction
+                                           of the caller process */
+  uint64_t mi_meta0_txnid, mi_meta0_sign;
+  uint64_t mi_meta1_txnid, mi_meta1_sign;
+  uint64_t mi_meta2_txnid, mi_meta2_sign;
+  uint32_t mi_maxreaders;   /**< Total reader slots in the environment */
+  uint32_t mi_numreaders;   /**< Max reader slots used in the environment */
+  uint32_t mi_dxb_pagesize; /**< Database pagesize */
+  uint32_t mi_sys_pagesize; /**< System pagesize */
+
+  /** \brief A mostly unique ID that is regenerated on each boot.
+
+   As such it can be used to identify the local machine's current boot.
+   MDBX uses this when opening the database to determine whether a rollback
+   to the last steady sync point is required, i.e. if the current bootid
+   differs from the value within a database, then the system was rebooted
+   and all changes since the last steady sync must be reverted for data
+   integrity. Zeros mean that no relevant information is available from
+   the system. */
+  struct {
+    struct {
+      uint64_t x, y;
+    } current, meta0, meta1, meta2;
+  } mi_bootid;
+
+  /** Bytes not explicitly synchronized to disk */
+  uint64_t mi_unsync_volume;
+  /** Current auto-sync threshold, see \ref mdbx_env_set_syncbytes(). */
+  uint64_t mi_autosync_threshold;
+  /** Time since the last steady sync in 1/65536 of a second */
+  uint32_t mi_since_sync_seconds16dot16;
+  /** Current auto-sync period in 1/65536 of a second,
+   * see \ref mdbx_env_set_syncperiod(). */
+  uint32_t mi_autosync_period_seconds16dot16;
+  /** Time since the last readers check in 1/65536 of a second,
+   * see \ref mdbx_reader_check(). */
+  uint32_t mi_since_reader_check_seconds16dot16;
+  /** Current environment mode.
+   * The same as \ref mdbx_env_get_flags() returns. */
+  uint32_t mi_mode;
+
+  /** Statistics of page operations.
+   * \details Overall statistics of page operations of all (running,
+   * completed and aborted) transactions in the current multi-process
+   * session (since the first process opened the database after everyone
+   * had previously closed it). */
+  struct {
+    uint64_t newly;   /**< Quantity of new pages added */
+    uint64_t cow;     /**< Quantity of pages copied for update */
+    uint64_t clone;   /**< Quantity of parent's dirty pages clones
+                         for nested transactions */
+    uint64_t split;   /**< Page splits */
+    uint64_t merge;   /**< Page merges */
+    uint64_t spill;   /**< Quantity of spilled dirty pages */
+    uint64_t unspill; /**< Quantity of unspilled/reloaded pages */
+    uint64_t wops;    /**< Number of explicit write operations (not pages)
+                         to a disk */
+  } mi_pgop_stat;
+};
+#ifndef __cplusplus
+/** \ingroup c_statinfo */
+typedef struct MDBX_envinfo MDBX_envinfo;
+#endif
+
+/** \brief Return information about the MDBX environment.
+ * \ingroup c_statinfo
+ *
+ * At least one of the env or txn arguments must be non-null. If txn is
+ * passed non-null, then the info will be filled according to the given
+ * transaction. Otherwise, if txn is null, then the info will be populated
+ * with a snapshot from the last committed write transaction, so at the next
+ * call different information may be returned.
+ *
+ * The legacy \ref mdbx_env_info() corresponds to calling
+ * \ref mdbx_env_info_ex() with a null `txn` argument.
+ *
+ * \param [in] env An environment handle returned by \ref mdbx_env_create()
+ * \param [in] txn A transaction handle returned by \ref mdbx_txn_begin()
+ * \param [out] info The address of an \ref MDBX_envinfo structure
+ *                   where the information will be copied
+ * \param [in] bytes The size of \ref MDBX_envinfo.
+ *
+ * \returns A non-zero error value on failure and 0 on success. */
+LIBMDBX_API int mdbx_env_info_ex(const MDBX_env *env, const MDBX_txn *txn,
+                                 MDBX_envinfo *info, size_t bytes);
+/** \brief Return information about the MDBX environment.
+ * \ingroup c_statinfo
+ * \deprecated Please use mdbx_env_info_ex() instead. */
+MDBX_DEPRECATED LIBMDBX_INLINE_API(int, mdbx_env_info,
+                                   (const MDBX_env *env, MDBX_envinfo *info,
+                                    size_t bytes)) {
+  return mdbx_env_info_ex(env, NULL, info, bytes);
+}
+
+/** \brief Flush the environment data buffers to disk.
+ * \ingroup c_extra
+ *
+ * Unless the environment was opened with no-sync flags (\ref MDBX_NOMETASYNC,
+ * \ref MDBX_SAFE_NOSYNC and \ref MDBX_UTTERLY_NOSYNC), the data is always
+ * written and flushed to disk when \ref mdbx_txn_commit() is called.
+ * Otherwise \ref mdbx_env_sync() may be called to manually write and flush
+ * unsynced data to disk.
+ *
+ * Besides, \ref mdbx_env_sync_ex() with the argument `force=false` may be
+ * used to provide a polling mode for lazy/asynchronous sync in conjunction
+ * with \ref mdbx_env_set_syncbytes() and/or \ref mdbx_env_set_syncperiod().
+ *
+ * \note This call is not valid if the environment was opened with
+ * MDBX_RDONLY.
+ *
+ * \param [in] env An environment handle returned by \ref mdbx_env_create()
+ * \param [in] force If non-zero, force a flush. Otherwise, if force is
+ *                   zero, it will run in polling mode,
+ *                   i.e. it will check the thresholds that were set by
+ *                   \ref mdbx_env_set_syncbytes()
+ *                   and/or \ref mdbx_env_set_syncperiod() and perform a
+ *                   flush if at least one of the thresholds is reached.
+ *
+ * \param [in] nonblock Don't wait if a write transaction
+ *                      is running in another thread.
+ *
+ * \returns A non-zero error value on failure and \ref MDBX_RESULT_TRUE or 0
+ *          on success. The \ref MDBX_RESULT_TRUE means no data is pending
+ *          for flush to disk, and 0 otherwise. Some possible errors are:
+ *
+ * \retval MDBX_EACCES the environment is read-only.
+ * \retval MDBX_BUSY the environment is used by another thread
+ *                   and `nonblock=true`.
+ * \retval MDBX_EINVAL an invalid parameter was specified.
+ * \retval MDBX_EIO an error occurred during synchronization. */
+LIBMDBX_API int mdbx_env_sync_ex(MDBX_env *env, bool force, bool nonblock);
+
+/** \brief The shortcut to calling \ref mdbx_env_sync_ex() with
+ * the `force=true` and `nonblock=false` arguments.
+ * \ingroup c_extra */
+LIBMDBX_INLINE_API(int, mdbx_env_sync, (MDBX_env * env)) {
+  return mdbx_env_sync_ex(env, true, false);
+}
+
+/** \brief The shortcut to calling \ref mdbx_env_sync_ex() with
+ * the `force=false` and `nonblock=true` arguments.
+ * \ingroup c_extra */
+LIBMDBX_INLINE_API(int, mdbx_env_sync_poll, (MDBX_env * env)) {
+  return mdbx_env_sync_ex(env, false, true);
+}
+
+/** \brief Sets the threshold to force flush the data buffers to disk, even
+ * when the \ref MDBX_SAFE_NOSYNC flag is set in the environment.
+ * \ingroup c_settings
+ *
+ * The threshold value affects all processes which operate with the given
+ * environment until the last process closes the environment or a new value
+ * is set.
+ *
+ * Data is always written to disk when \ref mdbx_txn_commit() is called, but
+ * the operating system may keep it buffered. MDBX always flushes the OS
+ * buffers upon commit as well, unless the environment was opened with
+ * \ref MDBX_SAFE_NOSYNC, \ref MDBX_UTTERLY_NOSYNC
+ * or in part \ref MDBX_NOMETASYNC.
+ *
+ * The default is 0, which means no threshold is checked and no additional
+ * flush will be made.
+ *
+ * \param [in] env An environment handle returned by mdbx_env_create().
+ * \param [in] threshold The size in bytes of summary changes when
+ *                       a synchronous flush would be made.
+ *
+ * \returns A non-zero error value on failure and 0 on success.
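+ *
+ * For example, a sketch (the threshold value is illustrative only):
+ * \code
+ * // flush once roughly 16 MiB of changes have accumulated
+ * mdbx_env_set_syncbytes(env, 16u << 20);
+ * \endcode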
+ */
+LIBMDBX_INLINE_API(int, mdbx_env_set_syncbytes,
+                   (MDBX_env * env, size_t threshold)) {
+  return mdbx_env_set_option(env, MDBX_opt_sync_bytes, threshold);
+}
+
+/** \brief Sets the relative period since the last unsteady commit to force
+ * flush the data buffers to disk, even when the \ref MDBX_SAFE_NOSYNC flag
+ * is set in the environment.
+ *
+ * \ingroup c_settings
+ *
+ * The relative period value affects all processes which operate with the
+ * given environment until the last process closes the environment or a new
+ * value is set.
+ *
+ * Data is always written to disk when \ref mdbx_txn_commit() is called, but
+ * the operating system may keep it buffered. MDBX always flushes the OS
+ * buffers upon commit as well, unless the environment was opened with
+ * \ref MDBX_SAFE_NOSYNC or in part \ref MDBX_NOMETASYNC.
+ *
+ * The settled period is not checked asynchronously, but only by the
+ * \ref mdbx_txn_commit() and \ref mdbx_env_sync() functions. Therefore, in
+ * cases where transactions are committed infrequently and/or irregularly,
+ * polling by \ref mdbx_env_sync() may be a reasonable solution to enforce
+ * the timeout.
+ *
+ * The default is 0, which means no timeout is checked and no additional
+ * flush will be made.
+ *
+ * \param [in] env An environment handle returned by \ref mdbx_env_create().
+ * \param [in] seconds_16dot16 The period in 1/65536 of a second when
+ *                             a synchronous flush would be made since
+ *                             the last unsteady commit.
+ *
+ * \returns A non-zero error value on failure and 0 on success. */
+LIBMDBX_INLINE_API(int, mdbx_env_set_syncperiod,
+                   (MDBX_env * env, unsigned seconds_16dot16)) {
+  return mdbx_env_set_option(env, MDBX_opt_sync_period, seconds_16dot16);
+}
+
+/** \brief Close the environment and release the memory map.
+ * \ingroup c_opening
+ *
+ * Only a single thread may call this function. All transactions, databases,
+ * and cursors must already be closed before calling this function. Attempts
+ * to use any such handles after calling this function will cause a
+ * `SIGSEGV`. The environment handle will be freed and must not be used
+ * again after this call.
+ *
+ * \param [in] env An environment handle returned by
+ *                 \ref mdbx_env_create().
+ *
+ * \param [in] dont_sync A "don't sync" flag: if non-zero, the last
+ *                 checkpoint will be kept "as is" and may still be "weak"
+ *                 in the \ref MDBX_SAFE_NOSYNC or \ref MDBX_UTTERLY_NOSYNC
+ *                 modes. Such a "weak" checkpoint will be ignored on the
+ *                 next opening, and the transactions since the last
+ *                 non-weak checkpoint (meta-page update) will be rolled
+ *                 back for the consistency guarantee.
+ *
+ * \returns A non-zero error value on failure and 0 on success,
+ *          some possible errors are:
+ * \retval MDBX_BUSY The write transaction is running in another thread;
+ *                   in such a case the \ref MDBX_env instance has NOT been
+ *                   destroyed nor released!
+ *                   \note If any OTHER error code was returned then the
+ *                   given MDBX_env instance has been destroyed and
+ *                   released.
+ *
+ * \retval MDBX_EBADSIGN The environment handle is already closed or not
+ *                       valid, i.e. \ref mdbx_env_close() was already
+ *                       called for the `env` or it was not created by
+ *                       \ref mdbx_env_create().
+ *
+ * \retval MDBX_PANIC If \ref mdbx_env_close_ex() was called in the child
+ *                    process after `fork()`. In this case \ref MDBX_PANIC
+ *                    is expected, i.e. the \ref MDBX_env instance was freed
+ *                    in a proper manner.
+ *
+ * \retval MDBX_EIO An error occurred during synchronization. */
+LIBMDBX_API int mdbx_env_close_ex(MDBX_env *env, bool dont_sync);
+
+/** \brief The shortcut to calling \ref mdbx_env_close_ex() with
+ * the `dont_sync=false` argument.
+ * \ingroup c_opening */
+LIBMDBX_INLINE_API(int, mdbx_env_close, (MDBX_env * env)) {
+  return mdbx_env_close_ex(env, false);
+}
+
+/** \brief Set environment flags.
+ * \ingroup c_settings
+ *
+ * This may be used to set some flags in addition to those from
+ * mdbx_env_open(), or to unset these flags.
+ * \see mdbx_env_get_flags()
+ *
+ * \note In contrast to LMDB, MDBX serializes threads via a mutex while
+ * changing the flags. Therefore this function will be blocked while a write
+ * transaction is running in another thread, or \ref MDBX_BUSY will be
+ * returned if the function is called within a write transaction.
+ *
+ * \param [in] env An environment handle returned
+ *                 by \ref mdbx_env_create().
+ * \param [in] flags The \ref env_flags to change, bitwise OR'ed together.
+ * \param [in] onoff A non-zero value sets the flags, zero clears them.
+ *
+ * \returns A non-zero error value on failure and 0 on success,
+ *          some possible errors are:
+ * \retval MDBX_EINVAL An invalid parameter was specified. */
+LIBMDBX_API int mdbx_env_set_flags(MDBX_env *env, MDBX_env_flags_t flags,
+                                   bool onoff);
+
+/** \brief Get environment flags.
+ * \ingroup c_statinfo
+ * \see mdbx_env_set_flags()
+ *
+ * \param [in] env An environment handle returned by \ref mdbx_env_create().
+ * \param [out] flags The address of an integer to store the flags.
+ *
+ * \returns A non-zero error value on failure and 0 on success,
+ *          some possible errors are:
+ * \retval MDBX_EINVAL An invalid parameter was specified. */
+LIBMDBX_API int mdbx_env_get_flags(const MDBX_env *env, unsigned *flags);
+
+/** \brief Return the path that was used in mdbx_env_open().
+ * \ingroup c_statinfo
+ *
+ * \param [in] env An environment handle returned by \ref mdbx_env_create()
+ * \param [out] dest Address of a string pointer to contain the path.
+ *                   This is the actual string in the environment, not a
+ *                   copy. It should not be altered in any way.
+ *
+ * \returns A non-zero error value on failure and 0 on success,
+ *          some possible errors are:
+ * \retval MDBX_EINVAL An invalid parameter was specified. */
+LIBMDBX_API int mdbx_env_get_path(const MDBX_env *env, const char **dest);
+
+/** \brief Return the file descriptor for the given environment.
+ * \ingroup c_statinfo
+ *
+ * \note All MDBX file descriptors have `FD_CLOEXEC` and
+ * can't be used after exec() and/or `fork()`.
+ *
+ * \param [in] env An environment handle returned by \ref mdbx_env_create().
+ * \param [out] fd Address of an int to contain the descriptor.
+ *
+ * \returns A non-zero error value on failure and 0 on success,
+ *          some possible errors are:
+ * \retval MDBX_EINVAL An invalid parameter was specified. */
+LIBMDBX_API int mdbx_env_get_fd(const MDBX_env *env, mdbx_filehandle_t *fd);
+
+/** \brief Set all size-related parameters of the environment, including the
+ * page size and the min/max size of the memory map. \ingroup c_settings
+ *
+ * In contrast to LMDB, MDBX provides automatic size management of the
+ * database according to the given parameters, including shrinking and
+ * resizing on the fly. From the user's point of view all of this just
+ * works. Nevertheless, it is reasonable to know some details in order to
+ * make optimal decisions when choosing parameters.
+ *
+ * Both \ref mdbx_env_set_geometry() and the legacy
+ * \ref mdbx_env_set_mapsize() are inapplicable to a read-only opened
+ * environment.
+ *
+ * Both \ref mdbx_env_set_geometry() and the legacy
+ * \ref mdbx_env_set_mapsize() could be called either before or after
+ * \ref mdbx_env_open(), either within a write transaction running in the
+ * current thread or not:
+ *
+ * - In case \ref mdbx_env_set_geometry() or the legacy
+ *   \ref mdbx_env_set_mapsize() was called BEFORE \ref mdbx_env_open(), i.e.
+ * - In case \ref mdbx_env_set_geometry() or legacy \ref mdbx_env_set_mapsize()
+ *   was called BEFORE \ref mdbx_env_open(), i.e. for a closed environment,
+ *   then the specified parameters will be used for new database creation,
+ *   or will be applied during opening if the database exists and no other
+ *   process is using it.
+ *
+ *   If the database already exists, is opened with \ref MDBX_EXCLUSIVE or is
+ *   not used by any other process, and the parameters specified by
+ *   \ref mdbx_env_set_geometry() are incompatible (i.e. for instance, a
+ *   different page size), then \ref mdbx_env_open() will return the
+ *   \ref MDBX_INCOMPATIBLE error.
+ *
+ *   Otherwise, if the database will be opened read-only or will be used by
+ *   another process during the call of \ref mdbx_env_open(), then the
+ *   specified parameters will be silently discarded (open the database with
+ *   the \ref MDBX_EXCLUSIVE flag to avoid this).
+ *
+ * - In case \ref mdbx_env_set_geometry() or legacy \ref mdbx_env_set_mapsize()
+ *   was called after \ref mdbx_env_open() WITHIN the write transaction running
+ *   by the current thread, then the specified parameters will be applied as a
+ *   part of the write transaction, i.e. they will not be visible to any other
+ *   processes until the current write transaction has been committed by the
+ *   current process. However, if the transaction is aborted, then the database
+ *   file will be reverted to the previous size, not immediately but when the
+ *   next transaction is committed or when the database is opened next time.
+ *
+ * - In case \ref mdbx_env_set_geometry() or legacy \ref mdbx_env_set_mapsize()
+ *   was called after \ref mdbx_env_open() but OUTSIDE a write transaction,
+ *   then MDBX will execute an internal pseudo-transaction to apply the new
+ *   parameters (but only if anything has been changed), and the changes will
+ *   be visible to any other processes immediately after successful completion
+ *   of the function.
+ *
+ * Essentially the concept of "automatic size management" is simple and
+ * useful:
+ *  - There are the lower and upper bounds of the database file size;
+ *  - There is the growth step by which the database file will be increased,
+ *    in case of lack of space;
+ *  - There is the threshold for unused space, beyond which the database file
+ *    will be shrunk;
+ *  - The size of the memory map is also the maximum size of the database;
+ *  - MDBX will automatically manage both the size of the database and the
+ *    size of the memory map, according to the given parameters.
+ *
+ * So, there are some considerations about choosing these parameters:
+ *  - The lower bound allows you to prevent database shrinking below some
+ *    rational size to avoid unnecessary resizing costs.
+ *  - The upper bound allows you to prevent database growth above some
+ *    rational size. Besides, the upper bound defines the linear address space
+ *    reservation in each process that opens the database. Therefore changing
+ *    the upper bound is costly and may require reopening the environment in
+ *    case of \ref MDBX_UNABLE_EXTEND_MAPSIZE errors, and so on. Therefore,
+ *    this value should be chosen reasonably large, to accommodate future
+ *    growth of the database.
+ *  - The growth step must be greater than zero to allow the database to
+ *    grow, but also reasonably not too small, since increasing the size by
+ *    little steps will result in a large overhead.
+ *  - The shrink threshold must be greater than zero to allow the database
+ *    to shrink, but also reasonably not too small (to avoid extra overhead)
+ *    and not less than the growth step to avoid up-and-down flouncing.
+ *  - The current size (i.e. the `size_now` argument) is an auxiliary
+ *    parameter for simulating the legacy \ref mdbx_env_set_mapsize() and as
+ *    a workaround for Windows issues (see below).
+ *
+ * Unfortunately, Windows has several issues
+ * with resizing of a memory-mapped file:
+ *  - Windows is unable to shrink a memory-mapped file (i.e. a memory-mapped
+ *    section) in any way except unmapping the file entirely and then mapping
+ *    it again. Moreover, it is impossible in any way if the memory-mapped
+ *    file is used by more than one process.
+ *  - Windows does not provide the usual API to augment a memory-mapped file
+ *    (that is, a memory-mapped section), but only by using "Native API"
+ *    in an undocumented way.
+ *
+ * MDBX bypasses all Windows issues, but at a cost:
+ *  - The ability to resize the database on the fly requires an additional
+ *    acquire and release of a `SlimReadWriteLock` during each read-only
+ *    transaction.
+ *  - During a resize all in-process threads should be paused and then
+ *    resumed.
+ *  - Shrinking of the database file is performed only when it is used by a
+ *    single process, i.e. when the database is closed by the last process or
+ *    opened by the first.
+ *  - Therefore, the `size_now` argument may be useful to set the database
+ *    size by the first process which opens the database, and thus avoid
+ *    expensive remapping further.
+ *
+ * To create a new database with particular parameters, including the page
+ * size, \ref mdbx_env_set_geometry() should be called after
+ * \ref mdbx_env_create() and before mdbx_env_open(). Once the database is
+ * created, the page size cannot be changed. If you do not specify all or some
+ * of the parameters, the corresponding default values will be used. For
+ * instance, the default for the database size is 10485760 bytes.
+ *
+ * If the mapsize is increased by another process, MDBX silently and
+ * transparently adopts this change at the next transaction start. However,
+ * \ref mdbx_txn_begin() will return \ref MDBX_UNABLE_EXTEND_MAPSIZE if the
+ * new mapping size could not be applied for the current process (for
+ * instance, if the address space is busy). Therefore, in the case of an
+ * \ref MDBX_UNABLE_EXTEND_MAPSIZE error you need to close and reopen the
+ * environment to resolve the error.
+ *
+ * \note Actual values may be different from those you have specified because
+ * of rounding to the specified database page size, the system page size
+ * and/or the size of the system virtual memory management unit. You can get
+ * the actual values by \ref mdbx_env_info_ex() or see them by using the tool
+ * `mdbx_chk` with the `-v` option.
+ *
+ * Legacy \ref mdbx_env_set_mapsize() corresponds to calling
+ * \ref mdbx_env_set_geometry() with the arguments `size_lower`, `size_now`,
+ * `size_upper` equal to the `size` and `-1` (i.e. default) for all other
+ * parameters.
+ *
+ * \param [in] env         An environment handle returned
+ *                         by \ref mdbx_env_create()
+ *
+ * \param [in] size_lower  The lower bound of database size in bytes.
+ *                         Zero value means "minimal acceptable",
+ *                         and negative means "keep current or use default".
+ *
+ * \param [in] size_now    The size in bytes to setup the database size for
+ *                         now. Zero value means "minimal acceptable", and
+ *                         negative means "keep current or use default". So,
+ *                         it is recommended to always pass -1 in this
+ *                         argument except in some special cases.
+ *
+ * \param [in] size_upper  The upper bound of database size in bytes.
+ *                         Zero value means "minimal acceptable",
+ *                         and negative means "keep current or use default".
+ *                         It is recommended to avoid changing the upper bound
+ *                         while the database is used by other processes or
+ *                         threads (i.e. just pass -1 in this argument except
+ *                         when absolutely necessary). Otherwise you must be
+ *                         ready for \ref MDBX_UNABLE_EXTEND_MAPSIZE error(s),
+ *                         unexpected pauses during remapping and/or system
+ *                         errors like "address busy", and so on. In other
+ *                         words, there is no way to handle a growth of the
+ *                         upper bound robustly because there may be a lack
+ *                         of appropriate system resources (which are
+ *                         extremely volatile in a multi-process
+ *                         multi-threaded environment).
+ *
+ * \param [in] growth_step The growth step in bytes, must be greater than
+ *                         zero to allow the database to grow. Negative value
+ *                         means "keep current or use default".
+ *
+ * \param [in] shrink_threshold The shrink threshold in bytes, must be greater
+ *                         than zero to allow the database to shrink and
+ *                         greater than growth_step to avoid shrinking
+ *                         right after grow.
+ *                         Negative value means "keep current
+ *                         or use default". Default is 2*growth_step.
+ *
+ * \param [in] pagesize    The database page size for new database
+ *                         creation or -1 otherwise. Must be a power of 2
+ *                         in the range between \ref MDBX_MIN_PAGESIZE and
+ *                         \ref MDBX_MAX_PAGESIZE. Zero value means
+ *                         "minimal acceptable", and negative means
+ *                         "keep current or use default".
+ *
+ * \returns A non-zero error value on failure and 0 on success,
+ *          some possible errors are:
+ * \retval MDBX_EINVAL    An invalid parameter was specified,
+ *                        or the environment has an active write transaction.
+ * \retval MDBX_EPERM     Specific for Windows: shrinking was disabled before,
+ *                        and now it is to be enabled, but there are reading
+ *                        threads that don't use the additional `SRWL` (that
+ *                        is required to avoid Windows issues).
+ * \retval MDBX_EACCESS   The environment was opened read-only.
+ * \retval MDBX_MAP_FULL  The specified size is smaller than the space already
+ *                        consumed by the environment.
+ * \retval MDBX_TOO_LARGE The specified size is too large, i.e. too many pages
+ *                        for the given size, or a 32-bit process requests too
+ *                        many bytes for the 32-bit address space. */
+LIBMDBX_API int mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower,
+                                      intptr_t size_now, intptr_t size_upper,
+                                      intptr_t growth_step,
+                                      intptr_t shrink_threshold,
+                                      intptr_t pagesize);
+
+/** \deprecated Please use \ref mdbx_env_set_geometry() instead.
+ * \ingroup c_settings */
+MDBX_DEPRECATED LIBMDBX_INLINE_API(int, mdbx_env_set_mapsize,
+                                   (MDBX_env * env, size_t size)) {
+  return mdbx_env_set_geometry(env, size, size, size, -1, -1, -1);
+}
+
+/** \brief Find out whether to use readahead or not, based on the given
+ * database size and the amount of available memory. \ingroup c_extra
+ *
+ * \param [in] volume      The expected database size in bytes.
+ * \param [in] redundancy  Additional reserve, or overload in case of a
+ *                         negative value.
+ *
+ * \returns A \ref MDBX_RESULT_TRUE or \ref MDBX_RESULT_FALSE value,
+ *          otherwise the error code:
+ * \retval MDBX_RESULT_TRUE   Readahead is reasonable.
+ * \retval MDBX_RESULT_FALSE  Readahead is NOT reasonable,
+ *                            i.e. \ref MDBX_NORDAHEAD is useful to
+ *                            open the environment by \ref mdbx_env_open().
+ * \retval Otherwise the error code. */
+LIBMDBX_API int mdbx_is_readahead_reasonable(size_t volume,
+                                             intptr_t redundancy);
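+
+/* A minimal sketch of a typical geometry setup, assuming a fresh environment
+ * and illustrative sizes only (1 MiB lower bound, 1 GiB upper bound, 16 MiB
+ * growth step, default shrink threshold and page size):
+ *
+ *   MDBX_env *env = NULL;
+ *   mdbx_env_create(&env);
+ *   int rc = mdbx_env_set_geometry(env,
+ *                                  1ul << 20,   // size_lower
+ *                                  -1,          // size_now: keep/default
+ *                                  1ul << 30,   // size_upper
+ *                                  16ul << 20,  // growth_step
+ *                                  -1,          // shrink_threshold: default
+ *                                  -1);         // pagesize: keep/default
+ *   // rc == MDBX_SUCCESS if the parameters are consistent
+ */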
+/** \brief Returns the minimal database page size in bytes.
+ * \ingroup c_statinfo */
+MDBX_NOTHROW_CONST_FUNCTION LIBMDBX_INLINE_API(intptr_t, mdbx_limits_pgsize_min,
+                                               (void)) {
+  return MDBX_MIN_PAGESIZE;
+}
+
+/** \brief Returns the maximal database page size in bytes.
+ * \ingroup c_statinfo */
+MDBX_NOTHROW_CONST_FUNCTION LIBMDBX_INLINE_API(intptr_t, mdbx_limits_pgsize_max,
+                                               (void)) {
+  return MDBX_MAX_PAGESIZE;
+}
+
+/** \brief Returns the minimal database size in bytes for the given page size,
+ * or -1 if pagesize is invalid.
+ * \ingroup c_statinfo */
+MDBX_NOTHROW_CONST_FUNCTION LIBMDBX_API intptr_t
+mdbx_limits_dbsize_min(intptr_t pagesize);
+
+/** \brief Returns the maximal database size in bytes for the given page size,
+ * or -1 if pagesize is invalid.
+ * \ingroup c_statinfo */
+MDBX_NOTHROW_CONST_FUNCTION LIBMDBX_API intptr_t
+mdbx_limits_dbsize_max(intptr_t pagesize);
+
+/** \brief Returns the maximal key size in bytes for the given page size
+ * and database flags, or -1 if pagesize is invalid.
+ * \ingroup c_statinfo
+ * \see db_flags */
+MDBX_NOTHROW_CONST_FUNCTION LIBMDBX_API intptr_t
+mdbx_limits_keysize_max(intptr_t pagesize, MDBX_db_flags_t flags);
+
+/** \brief Returns the maximal data size in bytes for the given page size
+ * and database flags, or -1 if pagesize is invalid.
+ * \ingroup c_statinfo
+ * \see db_flags */
+MDBX_NOTHROW_CONST_FUNCTION LIBMDBX_API intptr_t
+mdbx_limits_valsize_max(intptr_t pagesize, MDBX_db_flags_t flags);
+
+/** \brief Returns the maximal write transaction size (i.e. the limit on the
+ * summary volume of dirty pages) in bytes for the given page size, or -1 if
+ * pagesize is invalid.
+ * \ingroup c_statinfo */
+MDBX_NOTHROW_CONST_FUNCTION LIBMDBX_API intptr_t
+mdbx_limits_txnsize_max(intptr_t pagesize);
+
+/** \brief Set the maximum number of threads/reader slots for all processes
+ * interacting with the database. \ingroup c_settings
+ *
+ * \details This defines the number of slots in the lock table that is used to
+ * track readers in the environment. The default is about 100 for a 4K system
+ * page size. Starting a read-only transaction normally ties a lock table slot
+ * to the current thread until the environment closes or the thread exits. If
+ * \ref MDBX_NOTLS is in use, \ref mdbx_txn_begin() instead ties the slot to
+ * the \ref MDBX_txn object until it or the \ref MDBX_env object is destroyed.
+ * This function may only be called after \ref mdbx_env_create() and before
+ * \ref mdbx_env_open(), and has an effect only when the database is opened by
+ * the first process interacting with the database.
+ * \see mdbx_env_get_maxreaders()
+ *
+ * \param [in] env      An environment handle returned
+ *                      by \ref mdbx_env_create().
+ * \param [in] readers  The maximum number of reader lock table slots.
+ *
+ * \returns A non-zero error value on failure and 0 on success,
+ *          some possible errors are:
+ * \retval MDBX_EINVAL  An invalid parameter was specified.
+ * \retval MDBX_EPERM   The environment is already open. */
+LIBMDBX_INLINE_API(int, mdbx_env_set_maxreaders,
+                   (MDBX_env * env, unsigned readers)) {
+  return mdbx_env_set_option(env, MDBX_opt_max_readers, readers);
+}
+
+/** \brief Get the maximum number of threads/reader slots for the environment.
+ * \ingroup c_statinfo
+ * \see mdbx_env_set_maxreaders()
+ *
+ * \param [in] env      An environment handle returned
+ *                      by \ref mdbx_env_create().
+ * \param [out] readers Address of an integer to store the number of readers.
+ *
+ * \returns A non-zero error value on failure and 0 on success,
+ *          some possible errors are:
+ * \retval MDBX_EINVAL  An invalid parameter was specified. */
+LIBMDBX_INLINE_API(int, mdbx_env_get_maxreaders,
+                   (const MDBX_env *env, unsigned *readers)) {
+  int rc = MDBX_EINVAL;
+  if (readers) {
+    uint64_t proxy = 0;
+    rc = mdbx_env_get_option(env, MDBX_opt_max_readers, &proxy);
+    *readers = (unsigned)proxy;
+  }
+  return rc;
+}
+
+/** \brief Set the maximum number of named databases for the environment.
+ * \ingroup c_settings
+ *
+ * This function is only needed if multiple databases will be used in the
+ * environment. Simpler applications that use the environment as a single
+ * unnamed database can ignore this option.
+ * This function may only be called after \ref mdbx_env_create() and before
+ * \ref mdbx_env_open().
+ *
+ * Currently a moderate number of slots is cheap, but a huge number gets
+ * expensive: 7-120 words per transaction, and every \ref mdbx_dbi_open()
+ * does a linear search of the opened slots.
+ * \see mdbx_env_get_maxdbs()
+ *
+ * \param [in] env  An environment handle returned by \ref mdbx_env_create().
+ * \param [in] dbs  The maximum number of databases.
+ *
+ * \returns A non-zero error value on failure and 0 on success,
+ *          some possible errors are:
+ * \retval MDBX_EINVAL  An invalid parameter was specified.
+ * \retval MDBX_EPERM   The environment is already open. */
+LIBMDBX_INLINE_API(int, mdbx_env_set_maxdbs, (MDBX_env * env, MDBX_dbi dbs)) {
+  return mdbx_env_set_option(env, MDBX_opt_max_db, dbs);
+}
+
+/** \brief Get the maximum number of named databases for the environment.
+ * \ingroup c_statinfo
+ * \see mdbx_env_set_maxdbs()
+ *
+ * \param [in] env  An environment handle returned by \ref mdbx_env_create().
+ * \param [out] dbs Address to store the maximum number of databases.
+ *
+ * \returns A non-zero error value on failure and 0 on success,
+ *          some possible errors are:
+ * \retval MDBX_EINVAL  An invalid parameter was specified. */
+LIBMDBX_INLINE_API(int, mdbx_env_get_maxdbs,
+                   (const MDBX_env *env, MDBX_dbi *dbs)) {
+  int rc = MDBX_EINVAL;
+  if (dbs) {
+    uint64_t proxy = 0;
+    rc = mdbx_env_get_option(env, MDBX_opt_max_db, &proxy);
+    *dbs = (MDBX_dbi)proxy;
+  }
+  return rc;
+}
+
+/** \brief Returns the default size of database page for the current system.
+ * \ingroup c_statinfo
+ * \details The default size of the database page depends on the size of the
+ * system page and usually exactly matches it. */
+MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API size_t mdbx_default_pagesize(void);
+
+/** \brief Returns basic information about system RAM.
+ * This function provides a portable way to get information about available
+ * RAM and can be useful in that it returns the same information that libmdbx
+ * uses internally to adjust various options and control readahead.
+ * \ingroup c_statinfo
+ *
+ * \param [out] page_size   Optional address where the system page size
+ *                          will be stored.
+ * \param [out] total_pages Optional address where the number of total RAM
+ *                          pages will be stored.
+ * \param [out] avail_pages Optional address where the number of
+ *                          available/free RAM pages will be stored.
+ *
+ * \returns A non-zero error value on failure and 0 on success. */
+LIBMDBX_API int mdbx_get_sysraminfo(intptr_t *page_size, intptr_t *total_pages,
+                                    intptr_t *avail_pages);
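+
+/* A minimal setup sketch; the limits below are illustrative only, and both
+ * setters must be called between mdbx_env_create() and mdbx_env_open():
+ *
+ *   MDBX_env *env = NULL;
+ *   mdbx_env_create(&env);
+ *   mdbx_env_set_maxdbs(env, 42);       // up to 42 named databases
+ *   mdbx_env_set_maxreaders(env, 128);  // up to 128 reader slots
+ *   intptr_t sys_pagesize, total, avail;
+ *   if (mdbx_get_sysraminfo(&sys_pagesize, &total, &avail) == MDBX_SUCCESS) {
+ *     // e.g. feed the RAM volume into mdbx_is_readahead_reasonable()
+ *   }
+ */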
+/** \brief Returns the maximum size of keys that can be put.
+ * \ingroup c_statinfo
+ *
+ * \param [in] env    An environment handle returned by \ref mdbx_env_create().
+ * \param [in] flags  Database options (\ref MDBX_DUPSORT, \ref MDBX_INTEGERKEY
+ *                    and so on). \see db_flags
+ *
+ * \returns The maximum size of a key that can be written,
+ *          or -1 if something is wrong. */
+MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API int
+mdbx_env_get_maxkeysize_ex(const MDBX_env *env, MDBX_db_flags_t flags);
+
+/** \brief Returns the maximum size of data we can put.
+ * \ingroup c_statinfo
+ *
+ * \param [in] env    An environment handle returned by \ref mdbx_env_create().
+ * \param [in] flags  Database options (\ref MDBX_DUPSORT, \ref MDBX_INTEGERKEY
+ *                    and so on). \see db_flags
+ *
+ * \returns The maximum size of a data item that can be written,
+ *          or -1 if something is wrong. */
+MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API int
+mdbx_env_get_maxvalsize_ex(const MDBX_env *env, MDBX_db_flags_t flags);
+
+/** \deprecated Please use \ref mdbx_env_get_maxkeysize_ex()
+ *              and/or \ref mdbx_env_get_maxvalsize_ex()
+ * \ingroup c_statinfo */
+MDBX_NOTHROW_PURE_FUNCTION MDBX_DEPRECATED LIBMDBX_API int
+mdbx_env_get_maxkeysize(const MDBX_env *env);
+
+/** \brief Sets application information (a context pointer) associated with
+ * the environment.
+ * \see mdbx_env_get_userctx()
+ * \ingroup c_settings
+ *
+ * \param [in] env  An environment handle returned by \ref mdbx_env_create().
+ * \param [in] ctx  An arbitrary pointer for whatever the application needs.
+ *
+ * \returns A non-zero error value on failure and 0 on success. */
+LIBMDBX_API int mdbx_env_set_userctx(MDBX_env *env, void *ctx);
+
+/** \brief Returns the application information (a context pointer) associated
+ * with the environment.
+ * \see mdbx_env_set_userctx()
+ * \ingroup c_statinfo
+ *
+ * \param [in] env An environment handle returned by \ref mdbx_env_create()
+ * \returns The pointer set by \ref mdbx_env_set_userctx()
+ *          or `NULL` if something is wrong. */
+MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API void *
+mdbx_env_get_userctx(const MDBX_env *env);
+
+/** \brief Create a transaction with a user-provided context pointer
+ * for use with the environment.
+ * \ingroup c_transactions
+ *
+ * The transaction handle may be discarded using \ref mdbx_txn_abort()
+ * or \ref mdbx_txn_commit().
+ * \see mdbx_txn_begin()
+ *
+ * \note A transaction and its cursors must only be used by a single thread,
+ * and a thread may only have a single transaction at a time. If
+ * \ref MDBX_NOTLS is in use, this does not apply to read-only transactions.
+ *
+ * \note Cursors may not span transactions.
+ *
+ * \param [in] env     An environment handle returned by
+ *                     \ref mdbx_env_create().
+ *
+ * \param [in] parent  If this parameter is non-NULL, the new transaction will
+ *                     be a nested transaction, with the transaction indicated
+ *                     by parent as its parent. Transactions may be nested
+ *                     to any level. A parent transaction and its cursors may
+ *                     not issue any other operations than
+ *                     \ref mdbx_txn_commit() and \ref mdbx_txn_abort() while
+ *                     it has active child transactions.
+ *
+ * \param [in] flags   Special options for this transaction. This parameter
+ *                     must be set to 0 or by bitwise OR'ing together one
+ *                     or more of the values described here:
+ *                      - \ref MDBX_RDONLY   This transaction will not perform
+ *                                           any write operations.
+ *
+ *                      - \ref MDBX_TXN_TRY  Do not block when starting
+ *                                           a write transaction.
+ *
+ *                      - \ref MDBX_SAFE_NOSYNC, \ref MDBX_NOMETASYNC.
+ *                        Do not sync data to disk, corresponding to the
+ *                        \ref MDBX_NOMETASYNC or \ref MDBX_SAFE_NOSYNC
+ *                        description. \see sync_modes
+ *
+ * \param [out] txn    Address where the new \ref MDBX_txn handle
+ *                     will be stored.
+ *
+ * \param [in] context A pointer to application context to be associated with
+ *                     the created transaction, which can be retrieved by
+ *                     \ref mdbx_txn_get_userctx() until the transaction is
+ *                     finished.
+ *
+ * \returns A non-zero error value on failure and 0 on success,
+ *          some possible errors are:
+ * \retval MDBX_PANIC         A fatal error occurred earlier and the
+ *                            environment must be shut down.
+ * \retval MDBX_UNABLE_EXTEND_MAPSIZE Another process wrote data beyond
+ *                            this MDBX_env's mapsize and this
+ *                            environment map must be resized as well.
+ *                            See \ref mdbx_env_set_mapsize().
+ * \retval MDBX_READERS_FULL  A read-only transaction was requested and
+ *                            the reader lock table is full.
+ *                            See \ref mdbx_env_set_maxreaders().
+ * \retval MDBX_ENOMEM        Out of memory.
+ * \retval MDBX_BUSY          The write transaction is already started by the
+ *                            current thread. */
+LIBMDBX_API int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent,
+                                  MDBX_txn_flags_t flags, MDBX_txn **txn,
+                                  void *context);
+
+/** \brief Create a transaction for use with the environment.
+ * \ingroup c_transactions
+ *
+ * The transaction handle may be discarded using \ref mdbx_txn_abort()
+ * or \ref mdbx_txn_commit().
+ * \see mdbx_txn_begin_ex()
+ *
+ * \note A transaction and its cursors must only be used by a single thread,
+ * and a thread may only have a single transaction at a time. If
+ * \ref MDBX_NOTLS is in use, this does not apply to read-only transactions.
+ *
+ * \note Cursors may not span transactions.
+ *
+ * \param [in] env     An environment handle returned by
+ *                     \ref mdbx_env_create().
+ *
+ * \param [in] parent  If this parameter is non-NULL, the new transaction will
+ *                     be a nested transaction, with the transaction indicated
+ *                     by parent as its parent. Transactions may be nested
+ *                     to any level. A parent transaction and its cursors may
+ *                     not issue any other operations than
+ *                     \ref mdbx_txn_commit() and \ref mdbx_txn_abort() while
+ *                     it has active child transactions.
+ *
+ * \param [in] flags   Special options for this transaction. This parameter
+ *                     must be set to 0 or by bitwise OR'ing together one
+ *                     or more of the values described here:
+ *                      - \ref MDBX_RDONLY   This transaction will not perform
+ *                                           any write operations.
+ *
+ *                      - \ref MDBX_TXN_TRY  Do not block when starting
+ *                                           a write transaction.
+ *
+ *                      - \ref MDBX_SAFE_NOSYNC, \ref MDBX_NOMETASYNC.
+ *                        Do not sync data to disk, corresponding to the
+ *                        \ref MDBX_NOMETASYNC or \ref MDBX_SAFE_NOSYNC
+ *                        description. \see sync_modes
+ *
+ * \param [out] txn    Address where the new \ref MDBX_txn handle
+ *                     will be stored.
+ *
+ * \returns A non-zero error value on failure and 0 on success,
+ *          some possible errors are:
+ * \retval MDBX_PANIC         A fatal error occurred earlier and the
+ *                            environment must be shut down.
+ * \retval MDBX_UNABLE_EXTEND_MAPSIZE Another process wrote data beyond
+ *                            this MDBX_env's mapsize and this
+ *                            environment map must be resized as well.
+ *                            See \ref mdbx_env_set_mapsize().
+ * \retval MDBX_READERS_FULL  A read-only transaction was requested and
+ *                            the reader lock table is full.
+ *                            See \ref mdbx_env_set_maxreaders().
+ * \retval MDBX_ENOMEM        Out of memory.
+ * \retval MDBX_BUSY          The write transaction is already started by the
+ *                            current thread. */
+LIBMDBX_INLINE_API(int, mdbx_txn_begin,
+                   (MDBX_env * env, MDBX_txn *parent, MDBX_txn_flags_t flags,
+                    MDBX_txn **txn)) {
+  return mdbx_txn_begin_ex(env, parent, flags, txn, NULL);
+}
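+
+/* A minimal sketch of a read-only transaction carrying an application
+ * context; `struct request_state` is a hypothetical application type:
+ *
+ *   struct request_state *st = ...;
+ *   MDBX_txn *txn = NULL;
+ *   int rc = mdbx_txn_begin_ex(env, NULL, MDBX_TXN_RDONLY, &txn, st);
+ *   if (rc == MDBX_SUCCESS) {
+ *     struct request_state *again = mdbx_txn_get_userctx(txn);
+ *     // ... reads ...
+ *     mdbx_txn_abort(txn); // for read-only txns abort just releases snapshot
+ *   }
+ */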
+/** \brief Sets application information (a context pointer) associated with
+ * the transaction.
+ * \ingroup c_transactions
+ * \see mdbx_txn_get_userctx()
+ *
+ * \param [in] txn  A transaction handle returned by \ref mdbx_txn_begin_ex()
+ *                  or \ref mdbx_txn_begin().
+ * \param [in] ctx  An arbitrary pointer for whatever the application needs.
+ *
+ * \returns A non-zero error value on failure and 0 on success. */
+LIBMDBX_API int mdbx_txn_set_userctx(MDBX_txn *txn, void *ctx);
+
+/** \brief Returns the application information (a context pointer) associated
+ * with the transaction.
+ * \ingroup c_transactions
+ * \see mdbx_txn_set_userctx()
+ *
+ * \param [in] txn  A transaction handle returned by \ref mdbx_txn_begin_ex()
+ *                  or \ref mdbx_txn_begin().
+ * \returns The pointer which was passed via the `context` parameter
+ *          of `mdbx_txn_begin_ex()` or set by \ref mdbx_txn_set_userctx(),
+ *          or `NULL` if something is wrong. */
+MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API void *
+mdbx_txn_get_userctx(const MDBX_txn *txn);
+
+/** \brief Information about the transaction
+ * \ingroup c_statinfo
+ * \see mdbx_txn_info */
+struct MDBX_txn_info {
+  /** The ID of the transaction. For a READ-ONLY transaction, this corresponds
+      to the snapshot being read. */
+  uint64_t txn_id;
+
+  /** For READ-ONLY transactions: the lag from a recent MVCC-snapshot, i.e.
+      the number of committed transactions since the read transaction started.
+      For WRITE transactions (provided if `scan_rlt=true`): the lag of the
+      oldest reader from the current transaction (i.e. at least 1 if any
+      reader is running). */
+  uint64_t txn_reader_lag;
+
+  /** Used space by this transaction, i.e. corresponding to the last used
+   * database page. */
+  uint64_t txn_space_used;
+
+  /** Current size of the database file. */
+  uint64_t txn_space_limit_soft;
+
+  /** Upper bound for the size of the database file, i.e. the value of the
+      `size_upper` argument of the appropriate call of
+      \ref mdbx_env_set_geometry(). */
+  uint64_t txn_space_limit_hard;
+
+  /** For READ-ONLY transactions: the total size of the database pages that
+      were retired by committed write transactions after the reader's
+      MVCC-snapshot, i.e. the space which would be freed after the reader
+      releases the MVCC-snapshot for reuse upon completion of the read
+      transaction.
+      For WRITE transactions: the summarized size of the database pages that
+      were retired so far due to Copy-On-Write during this transaction. */
+  uint64_t txn_space_retired;
+
+  /** For READ-ONLY transactions: the space available for writer(s), which
+      must be exhausted before the Handle-Slow-Readers callback will be called
+      for this read transaction.
+      For WRITE transactions: the space inside the transaction
+      that is left until an `MDBX_TXN_FULL` error. */
+  uint64_t txn_space_leftover;
+
+  /** For READ-ONLY transactions (provided if `scan_rlt=true`): the space that
+      will actually become available for reuse once this transaction is
+      finished.
+      For WRITE transactions: the summarized size of the dirty database
+      pages generated during this transaction. */
+  uint64_t txn_space_dirty;
+};
+#ifndef __cplusplus
+/** \ingroup c_statinfo */
+typedef struct MDBX_txn_info MDBX_txn_info;
+#endif
+
+/** \brief Return information about the MDBX transaction.
+ * \ingroup c_statinfo
+ *
+ * \param [in] txn       A transaction handle returned by \ref mdbx_txn_begin()
+ * \param [out] info     The address of an \ref MDBX_txn_info structure
+ *                       where the information will be copied.
+ * \param [in] scan_rlt  A boolean flag that controls scanning of the read
+ *                       lock table to provide complete information. Such a
+ *                       scan is relatively expensive, and you can avoid it
+ *                       if the corresponding fields are not needed.
+ *                       See the description of \ref MDBX_txn_info.
+ *
+ * \returns A non-zero error value on failure and 0 on success. */
+LIBMDBX_API int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info,
+                              bool scan_rlt);
+
+/** \brief Returns the transaction's MDBX_env.
+ * \ingroup c_transactions
+ *
+ * \param [in] txn  A transaction handle returned by \ref mdbx_txn_begin() */
+MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API MDBX_env *
+mdbx_txn_env(const MDBX_txn *txn);
+
+/** \brief Return the transaction's flags.
+ * \ingroup c_transactions
+ *
+ * This returns the flags associated with this transaction.
+ *
+ * \param [in] txn  A transaction handle returned by \ref mdbx_txn_begin().
+ *
+ * \returns The transaction's flags, valid if the input is a valid
+ *          transaction, otherwise -1. */
+MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API int mdbx_txn_flags(const MDBX_txn *txn);
+
+/** \brief Return the transaction's ID.
+ * \ingroup c_statinfo
+ *
+ * This returns the identifier associated with this transaction. For a
+ * read-only transaction, this corresponds to the snapshot being read;
+ * concurrent readers will frequently have the same transaction ID.
+ *
+ * \param [in] txn  A transaction handle returned by \ref mdbx_txn_begin().
+ *
+ * \returns The transaction ID, valid if the input is an active transaction,
+ *          otherwise 0. */
+MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API uint64_t
+mdbx_txn_id(const MDBX_txn *txn);
+
+/** \brief Latency of commit stages in 1/65536 of seconds units.
+ * \warning This structure may be changed in future releases.
+ * \see mdbx_txn_commit_ex() */
+struct MDBX_commit_latency {
+  /** \brief Duration of preparation (committing child transactions, updating
+   * sub-database records and destroying cursors). */
+  uint32_t preparation;
+  /** \brief Duration of GC/freeDB handling & updating. */
+  uint32_t gc;
+  /** \brief Duration of the internal audit if enabled. */
+  uint32_t audit;
+  /** \brief Duration of writing dirty/modified data pages. */
+  uint32_t write;
+  /** \brief Duration of syncing written data to the disk/storage. */
+  uint32_t sync;
+  /** \brief Duration of transaction ending (releasing resources). */
+  uint32_t ending;
+  /** \brief The total duration of a commit. */
+  uint32_t whole;
+};
+#ifndef __cplusplus
+/** \ingroup c_statinfo */
+typedef struct MDBX_commit_latency MDBX_commit_latency;
+#endif
+
+/** \brief Commit all the operations of a transaction into the database and
+ * collect latency information.
+ * \see mdbx_txn_commit()
+ * \ingroup c_statinfo
+ * \warning This function may be changed in future releases. */
+LIBMDBX_API int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency);
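+
+/* A minimal sketch measuring commit latency; the printed fields are in
+ * 1/65536-second units as documented above (stdio assumed):
+ *
+ *   MDBX_commit_latency lat;
+ *   int rc = mdbx_txn_commit_ex(txn, &lat);
+ *   if (rc == MDBX_SUCCESS)
+ *     printf("commit: write=%u sync=%u whole=%u (1/65536 sec)\n",
+ *            lat.write, lat.sync, lat.whole);
+ */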
+/** \brief Commit all the operations of a transaction into the database.
+ * \ingroup c_transactions
+ *
+ * If the current thread is not eligible to manage the transaction, then
+ * the \ref MDBX_THREAD_MISMATCH error will be returned. Otherwise the
+ * transaction will be committed and its handle freed. If the transaction
+ * cannot be committed, it will be aborted with the corresponding error
+ * returned.
+ *
+ * Thus, a result other than \ref MDBX_THREAD_MISMATCH means that the
+ * transaction is terminated:
+ *  - Resources are released;
+ *  - The transaction handle is invalid;
+ *  - Cursor(s) associated with the transaction must not be used, except with
+ *    mdbx_cursor_renew() and \ref mdbx_cursor_close().
+ *    Such cursor(s) must be closed explicitly by \ref mdbx_cursor_close()
+ *    before or after the transaction commit, or can be reused with
+ *    \ref mdbx_cursor_renew() until explicitly closed by
+ *    \ref mdbx_cursor_close().
+ *
+ * \param [in] txn  A transaction handle returned by \ref mdbx_txn_begin().
+ *
+ * \returns A non-zero error value on failure and 0 on success,
+ *          some possible errors are:
+ * \retval MDBX_RESULT_TRUE      Transaction was aborted since it had to
+ *                               be aborted due to previous errors.
+ * \retval MDBX_PANIC            A fatal error occurred earlier
+ *                               and the environment must be shut down.
+ * \retval MDBX_BAD_TXN          Transaction is already finished or never
+ *                               began.
+ * \retval MDBX_EBADSIGN         Transaction object has an invalid signature,
+ *                               e.g. the transaction was already terminated
+ *                               or memory was corrupted.
+ * \retval MDBX_THREAD_MISMATCH  The given transaction is not owned
+ *                               by the current thread.
+ * \retval MDBX_EINVAL           Transaction handle is NULL.
+ * \retval MDBX_ENOSPC           No more disk space.
+ * \retval MDBX_EIO              A system-level I/O error occurred.
+ * \retval MDBX_ENOMEM           Out of memory. */
+LIBMDBX_INLINE_API(int, mdbx_txn_commit, (MDBX_txn * txn)) {
+  return mdbx_txn_commit_ex(txn, NULL);
+}
+
+/** \brief Abandon all the operations of the transaction instead of saving
+ * them.
+ * \ingroup c_transactions
+ *
+ * The transaction handle is freed. It and its cursors must not be used again
+ * after this call, except with \ref mdbx_cursor_renew() and
+ * \ref mdbx_cursor_close().
+ *
+ * If the current thread is not eligible to manage the transaction, then
+ * the \ref MDBX_THREAD_MISMATCH error will be returned. Otherwise the
+ * transaction will be aborted and its handle freed. Thus, a result other than
+ * \ref MDBX_THREAD_MISMATCH means that the transaction is terminated:
+ *  - Resources are released;
+ *  - The transaction handle is invalid;
+ *  - Cursor(s) associated with the transaction must not be used, except with
+ *    \ref mdbx_cursor_renew() and \ref mdbx_cursor_close().
+ *    Such cursor(s) must be closed explicitly by \ref mdbx_cursor_close()
+ *    before or after the transaction abort, or can be reused with
+ *    \ref mdbx_cursor_renew() until explicitly closed by
+ *    \ref mdbx_cursor_close().
+ *
+ * \param [in] txn  A transaction handle returned by \ref mdbx_txn_begin().
+ *
+ * \returns A non-zero error value on failure and 0 on success,
+ *          some possible errors are:
+ * \retval MDBX_PANIC            A fatal error occurred earlier and
+ *                               the environment must be shut down.
+ * \retval MDBX_BAD_TXN          Transaction is already finished or never
+ *                               began.
+ * \retval MDBX_EBADSIGN         Transaction object has an invalid signature,
+ *                               e.g. the transaction was already terminated
+ *                               or memory was corrupted.
+ * \retval MDBX_THREAD_MISMATCH  The given transaction is not owned
+ *                               by the current thread.
+ * \retval MDBX_EINVAL           Transaction handle is NULL. */
+LIBMDBX_API int mdbx_txn_abort(MDBX_txn *txn);
+
+/** \brief Marks the transaction as broken.
+ * \ingroup c_transactions
+ *
+ * The function keeps the transaction handle and the corresponding locks, but
+ * makes it impossible to perform any operations within the broken
+ * transaction. A broken transaction must then be aborted explicitly later.
+ *
+ * \param [in] txn  A transaction handle returned by \ref mdbx_txn_begin().
+ *
+ * \see mdbx_txn_abort() \see mdbx_txn_reset() \see mdbx_txn_commit()
+ * \returns A non-zero error value on failure and 0 on success. */
+LIBMDBX_API int mdbx_txn_break(MDBX_txn *txn);
+
+/** \brief Reset a read-only transaction.
+ * \ingroup c_transactions
+ *
+ * Abort the read-only transaction like \ref mdbx_txn_abort(), but keep the
+ * transaction handle. Therefore \ref mdbx_txn_renew() may reuse the handle.
+ * This saves allocation overhead if the process will start a new read-only
+ * transaction soon, and also locking overhead if \ref MDBX_NOTLS is in use.
+ * The reader table lock is released, but the table slot stays tied to its
+ * thread or \ref MDBX_txn. Use \ref mdbx_txn_abort() to discard a reset
+ * handle, and to free its lock table slot if \ref MDBX_NOTLS is in use.
+ *
+ * Cursors opened within the transaction must not be used again after this
+ * call, except with \ref mdbx_cursor_renew() and \ref mdbx_cursor_close().
+ *
+ * Reader locks generally don't interfere with writers, but they keep old
+ * versions of database pages allocated. Thus they prevent the old pages from
+ * being reused when writers commit new data, and so under heavy load the
+ * database size may grow much more rapidly than otherwise.
+ *
+ * \param [in] txn  A transaction handle returned by \ref mdbx_txn_begin().
+ *
+ * \returns A non-zero error value on failure and 0 on success,
+ *          some possible errors are:
+ * \retval MDBX_PANIC            A fatal error occurred earlier and
+ *                               the environment must be shut down.
+ * \retval MDBX_BAD_TXN          Transaction is already finished or never
+ *                               began.
+ * \retval MDBX_EBADSIGN         Transaction object has an invalid signature,
+ *                               e.g. the transaction was already terminated
+ *                               or memory was corrupted.
+ * \retval MDBX_THREAD_MISMATCH  The given transaction is not owned
+ *                               by the current thread.
+ * \retval MDBX_EINVAL           Transaction handle is NULL. */
+LIBMDBX_API int mdbx_txn_reset(MDBX_txn *txn);
+
+/** \brief Renew a read-only transaction.
+ * \ingroup c_transactions
+ *
+ * This acquires a new reader lock for a transaction handle that had been
+ * released by \ref mdbx_txn_reset(). It must be called before a reset
+ * transaction may be used again.
+ *
+ * \param [in] txn  A transaction handle returned by \ref mdbx_txn_begin().
+ *
+ * \returns A non-zero error value on failure and 0 on success,
+ *          some possible errors are:
+ * \retval MDBX_PANIC            A fatal error occurred earlier and
+ *                               the environment must be shut down.
+ * \retval MDBX_BAD_TXN          Transaction is already finished or never
+ *                               began.
+ * \retval MDBX_EBADSIGN         Transaction object has an invalid signature,
+ *                               e.g. the transaction was already terminated
+ *                               or memory was corrupted.
+ * \retval MDBX_THREAD_MISMATCH  The given transaction is not owned
+ *                               by the current thread.
+ * \retval MDBX_EINVAL           Transaction handle is NULL. */
+LIBMDBX_API int mdbx_txn_renew(MDBX_txn *txn);
+
+/** \brief The four integer markers (aka "canary") associated with the
+ * environment. \ingroup c_crud \see mdbx_canary_put() \see mdbx_canary_get()
+ *
+ * The `x`, `y` and `z` values could be set by \ref mdbx_canary_put(), while
+ * `v` will always be set to the transaction number. Updated values become
+ * visible outside the current transaction only after it has been committed.
+ * Current values can be retrieved by \ref mdbx_canary_get(). */
+struct MDBX_canary {
+  uint64_t x, y, z, v;
+};
+#ifndef __cplusplus
+/** \ingroup c_crud */
+typedef struct MDBX_canary MDBX_canary;
+#endif
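+
+/* A minimal sketch of the reset/renew pattern for a long-lived reader handle,
+ * assuming `work()` is a hypothetical application callback:
+ *
+ *   MDBX_txn *ro = NULL;
+ *   mdbx_txn_begin(env, NULL, MDBX_TXN_RDONLY, &ro);
+ *   for (;;) {
+ *     work(ro);            // read at a fixed MVCC snapshot
+ *     mdbx_txn_reset(ro);  // release the snapshot, keep the handle
+ *     // ... idle: retired pages may be reclaimed meanwhile ...
+ *     mdbx_txn_renew(ro);  // re-acquire a fresh snapshot
+ *   }
+ *   // eventually: mdbx_txn_abort(ro);
+ */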
+
+/** \brief Set integer markers (aka "canary") associated with the environment.
+ * \ingroup c_crud
+ * \see mdbx_canary_get()
+ *
+ * \param [in] txn     A transaction handle returned by \ref mdbx_txn_begin()
+ * \param [in] canary  An optional pointer to a \ref MDBX_canary structure
+ *                     from which the `x`, `y` and `z` values are taken.
+ *                     - If canary is NOT NULL, then the `x`, `y` and `z`
+ *                       values will be updated from the given canary
+ *                       argument, but `v` will always be set to the current
+ *                       transaction number if at least one of the `x`, `y` or
+ *                       `z` values has changed (i.e. if `x`, `y` and `z` have
+ *                       the same values as currently present, then nothing
+ *                       will be changed or updated).
+ *                     - If canary is NULL, then the `v` value will be
+ *                       explicitly updated to the current transaction number
+ *                       without changing `x`, `y` or `z`.
+ *
+ * \returns A non-zero error value on failure and 0 on success. */
+LIBMDBX_API int mdbx_canary_put(MDBX_txn *txn, const MDBX_canary *canary);
+
+/** \brief Returns the four integer markers (aka "canary") associated with the
+ * environment.
+ * \ingroup c_crud
+ * \see mdbx_canary_put()
+ *
+ * \param [in] txn     A transaction handle returned by \ref mdbx_txn_begin().
+ * \param [in] canary  The address of an MDBX_canary structure where the
+ *                     information will be copied.
+ *
+ * \returns A non-zero error value on failure and 0 on success. */
+LIBMDBX_API int mdbx_canary_get(const MDBX_txn *txn, MDBX_canary *canary);
+
+/** \brief A callback function used to compare two keys in a database
+ * \ingroup c_crud
+ * \see mdbx_cmp() \see mdbx_get_keycmp()
+ * \see mdbx_get_datacmp \see mdbx_dcmp() */
+typedef int(MDBX_cmp_func)(const MDBX_val *a,
+                           const MDBX_val *b) MDBX_CXX17_NOEXCEPT;
+
+/** \brief Open or Create a database in the environment.
+ * \ingroup c_dbi
+ *
+ * A database handle denotes the name and parameters of a database,
+ * independently of whether such a database exists. The database handle may be
+ * discarded by calling \ref mdbx_dbi_close(). The old database handle is
+ * returned if the database was already open. The handle may only be closed
+ * once.
+ *
+ * \note A notable difference between MDBX and LMDB is that MDBX makes handles
+ * opened for existing databases immediately available for other transactions,
+ * regardless of whether the opening transaction will be aborted or reset. The
+ * reason for this is to avoid the requirement of opening the same handles
+ * multiple times in concurrent read transactions, and of tracking such open
+ * but hidden handles until the completion of the read transactions which
+ * opened them.
+ *
+ * Nevertheless, the handle for a NEWLY CREATED database will be invisible
+ * to other transactions until this write transaction is successfully
+ * committed. If the write transaction is aborted, the handle will be closed
+ * automatically. After a successful commit such a handle will reside in the
+ * shared environment, and may be used by other transactions.
+ *
+ * In contrast to LMDB, MDBX allows this function to be called from multiple
+ * concurrent transactions or threads in the same process.
+ *
+ * To use a named database (with name != NULL), \ref mdbx_env_set_maxdbs()
+ * must be called before opening the environment. Table names are
+ * keys in the internal unnamed database, and may be read but not written.
+ *
+ * \param [in] txn    A transaction handle returned by \ref mdbx_txn_begin().
+ * \param [in] name   The name of the database to open. If only a single
+ *                    database is needed in the environment,
+ *                    this value may be NULL.
+ * \param [in] flags  Special options for this database. This parameter must
+ *                    be set to 0 or by bitwise OR'ing together one or more
+ *                    of the values described here:
+ *  - \ref MDBX_REVERSEKEY
+ *      Keys are strings to be compared in reverse order, from the end
+ *      of the strings to the beginning. By default, keys are treated as
+ *      strings and compared from beginning to end.
+ *  - \ref MDBX_INTEGERKEY
+ *      Keys are binary integers in native byte order, either uint32_t or
+ *      uint64_t, and will be sorted as such.
+ *      The keys must all be of the same size and must be aligned when passed
+ *      as arguments.
+ *  - \ref MDBX_DUPSORT
+ *      Duplicate keys may be used in the database. Or, from another point of
+ *      view, keys may have multiple data items, stored in sorted order. By
+ *      default keys must be unique and may have only a single data item.
+ *  - \ref MDBX_DUPFIXED
+ *      This flag may only be used in combination with \ref MDBX_DUPSORT. This
+ *      option tells the library that the data items for this database are
+ *      all the same size, which allows further optimizations in storage and
+ *      retrieval. When all data items are the same size, the
+ *      \ref MDBX_GET_MULTIPLE, \ref MDBX_NEXT_MULTIPLE and
+ *      \ref MDBX_PREV_MULTIPLE cursor operations may be used to retrieve
+ *      multiple items at once.
+ *  - \ref MDBX_INTEGERDUP
+ *      This option specifies that duplicate data items are binary integers,
+ *      similar to \ref MDBX_INTEGERKEY keys. The data values must all be of
+ *      the same size and must be aligned when passed as arguments.
+ *  - \ref MDBX_REVERSEDUP
+ *      This option specifies that duplicate data items should be compared as
+ *      strings in reverse order (the comparison is performed in the direction
+ *      from the last byte to the first).
+ *  - \ref MDBX_CREATE
+ *      Create the named database if it doesn't exist. This option is not
+ *      allowed in a read-only transaction or a read-only environment.
+ *
+ * \param [out] dbi  Address where the new \ref MDBX_dbi handle
+ *                   will be stored.
+ *
+ * For \ref mdbx_dbi_open_ex() additional arguments allow you to set custom
+ * comparison functions for keys and values (for multimaps).
+ * However, it is recommended not to use custom comparison functions, but
+ * instead to convert the keys to one of the forms that are suitable for the
+ * built-in comparators (for instance, take a look at \ref value2key).
+ * The reasons for not using custom comparators are:
+ *  - The order of records cannot be validated without your code.
+ *    So the `mdbx_chk` utility will report "wrong order" errors,
+ *    and the `-i` option is required to ignore them.
+ *  - Records cannot be ordered or sorted without your code.
+ *    So the mdbx_load utility should be used with the `-a` option to preserve
+ *    the input data order.
+ *
+ * \returns A non-zero error value on failure and 0 on success,
+ *          some possible errors are:
+ * \retval MDBX_NOTFOUND      The specified database doesn't exist in the
+ *                            environment and \ref MDBX_CREATE was not
+ *                            specified.
+ * \retval MDBX_DBS_FULL      Too many databases have been opened.
+ *                            \see mdbx_env_set_maxdbs()
+ * \retval MDBX_INCOMPATIBLE  Database is incompatible with the given flags,
+ *                            i.e. the passed flags are different from those
+ *                            with which the database was created, or the
+ *                            database was already opened with a different
+ *                            comparison function(s).
+ * \retval MDBX_THREAD_MISMATCH  The given transaction is not owned
+ *                               by the current thread. */
+LIBMDBX_API int mdbx_dbi_open(MDBX_txn *txn, const char *name,
+                              MDBX_db_flags_t flags, MDBX_dbi *dbi);
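+
+/* A minimal sketch opening a named database inside a write transaction; the
+ * name "accounts" is illustrative only:
+ *
+ *   MDBX_txn *txn = NULL;
+ *   MDBX_dbi dbi = 0;
+ *   if (mdbx_txn_begin(env, NULL, MDBX_TXN_READWRITE, &txn) == MDBX_SUCCESS) {
+ *     int rc = mdbx_dbi_open(txn, "accounts", MDBX_CREATE, &dbi);
+ *     if (rc == MDBX_SUCCESS)
+ *       mdbx_txn_commit(txn); // a new handle becomes visible after commit
+ *     else
+ *       mdbx_txn_abort(txn);
+ *   }
+ */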
+/** \deprecated Please avoid using custom comparators
+ *              and use mdbx_dbi_open() instead.
+ * \ingroup c_dbi
+ *
+ * \param [in] txn      A transaction handle returned by \ref mdbx_txn_begin().
+ * \param [in] name     The name of the database to open. If only a single
+ *                      database is needed in the environment,
+ *                      this value may be NULL.
+ * \param [in] flags    Special options for this database.
+ * \param [in] keycmp   Optional custom key comparison function for a database.
+ * \param [in] datacmp  Optional custom data comparison function for a
+ *                      database.
+ * \param [out] dbi     Address where the new MDBX_dbi handle will be stored.
+ * \returns A non-zero error value on failure and 0 on success. */
+MDBX_DEPRECATED LIBMDBX_API int
+mdbx_dbi_open_ex(MDBX_txn *txn, const char *name, MDBX_db_flags_t flags,
+                 MDBX_dbi *dbi, MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp);
+
+/** \defgroup value2key Value-to-Key functions to avoid custom comparators
+ * \see key2value
+ * @{
+ *
+ * The \ref mdbx_key_from_jsonInteger() builds keys which are comparable with
+ * keys created by \ref mdbx_key_from_double(). So this allows mixing
+ * `int64_t` and IEEE754 double values in one index for JSON-numbers, with the
+ * restriction for the integer numbers range corresponding to RFC-7159, i.e.
+ * \f$[-2^{53}+1, 2^{53}-1]\f$. See the bottom of page 6 at
+ * https://tools.ietf.org/html/rfc7159 */
+MDBX_NOTHROW_CONST_FUNCTION LIBMDBX_API uint64_t
+mdbx_key_from_jsonInteger(const int64_t json_integer);
+
+MDBX_NOTHROW_CONST_FUNCTION LIBMDBX_API uint64_t
+mdbx_key_from_double(const double ieee754_64bit);
+
+MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API uint64_t
+mdbx_key_from_ptrdouble(const double *const ieee754_64bit);
+
+MDBX_NOTHROW_CONST_FUNCTION LIBMDBX_API uint32_t
+mdbx_key_from_float(const float ieee754_32bit);
+
+MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API uint32_t
+mdbx_key_from_ptrfloat(const float *const ieee754_32bit);
+
+MDBX_NOTHROW_CONST_FUNCTION LIBMDBX_INLINE_API(uint64_t, mdbx_key_from_int64,
+                                               (const int64_t i64)) {
+  return UINT64_C(0x8000000000000000) + i64;
+}
+
+MDBX_NOTHROW_CONST_FUNCTION LIBMDBX_INLINE_API(uint32_t, mdbx_key_from_int32,
+                                               (const int32_t i32)) {
+  return UINT32_C(0x80000000) + i32;
+}
+/** @} */
+
+/** \defgroup key2value Key-to-Value functions to avoid custom comparators
+ * \see value2key
+ * @{ */
+MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API int64_t
+mdbx_jsonInteger_from_key(const MDBX_val);
+
+MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API double
+mdbx_double_from_key(const MDBX_val);
+
+MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API float
+mdbx_float_from_key(const MDBX_val);
+
+MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API int32_t
+mdbx_int32_from_key(const MDBX_val);
+
+MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API int64_t
+mdbx_int64_from_key(const MDBX_val);
+/** @} */
+
+/** \brief Retrieve statistics for a database.
+ * \ingroup c_statinfo
+ *
+ * \param [in] txn    A transaction handle returned by \ref mdbx_txn_begin().
+ * \param [in] dbi    A database handle returned by \ref mdbx_dbi_open().
+ * \param [out] stat  The address of an \ref MDBX_stat structure where
+ *                    the statistics will be copied.
+ * \param [in] bytes  The size of \ref MDBX_stat.
+ *
+ * \returns A non-zero error value on failure and 0 on success,
+ *          some possible errors are:
+ * \retval MDBX_THREAD_MISMATCH  The given transaction is not owned
+ *                               by the current thread.
+ * \retval MDBX_EINVAL           An invalid parameter was specified. */
+LIBMDBX_API int mdbx_dbi_stat(MDBX_txn *txn, MDBX_dbi dbi, MDBX_stat *stat,
+                              size_t bytes);
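+
+/* A minimal sketch of the value2key/key2value helpers above: mapping a signed
+ * 64-bit value to an unsigned key whose natural unsigned order matches the
+ * signed order, suitable for a \ref MDBX_INTEGERKEY database (the `ordinal`
+ * value is illustrative only):
+ *
+ *   int64_t ordinal = -42;
+ *   uint64_t shifted = mdbx_key_from_int64(ordinal); // bias by 2^63
+ *   MDBX_val key = { &shifted, sizeof(shifted) };
+ *   // ... mdbx_put(txn, dbi, &key, &data, MDBX_UPSERT) ...
+ *   // and back again:
+ *   int64_t restored = mdbx_int64_from_key(key);
+ */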
+
+/** \brief Retrieve the depth (bitmask) information of nested dupsort
+ * (multi-value) B+trees for the given database.
+ * \ingroup c_statinfo
+ *
+ * \param [in] txn    A transaction handle returned by \ref mdbx_txn_begin().
+ * \param [in] dbi    A database handle returned by \ref mdbx_dbi_open().
+ * \param [out] mask  The address of an uint32_t value where the bitmask
+ *                    will be stored.
+ *
+ * \returns A non-zero error value on failure and 0 on success,
+ *          some possible errors are:
+ * \retval MDBX_THREAD_MISMATCH  The given transaction is not owned
+ *                               by the current thread.
+ * \retval MDBX_EINVAL           An invalid parameter was specified.
+ * \retval MDBX_RESULT_TRUE      The dbi isn't a dupsort (multi-value)
+ *                               database. */
+LIBMDBX_API int mdbx_dbi_dupsort_depthmask(MDBX_txn *txn, MDBX_dbi dbi,
+                                           uint32_t *mask);
+
+/** \brief DBI state bits returned by \ref mdbx_dbi_flags_ex()
+ * \ingroup c_statinfo
+ * \see mdbx_dbi_flags_ex() */
+enum MDBX_dbi_state_t {
+  /** DB was written in this txn */
+  MDBX_DBI_DIRTY = 0x01,
+  /** Named-DB record is older than txnID */
+  MDBX_DBI_STALE = 0x02,
+  /** Named-DB handle opened in this txn */
+  MDBX_DBI_FRESH = 0x04,
+  /** Named-DB handle created in this txn */
+  MDBX_DBI_CREAT = 0x08,
+};
+#ifndef __cplusplus
+/** \ingroup c_statinfo */
+typedef enum MDBX_dbi_state_t MDBX_dbi_state_t;
+#else
+DEFINE_ENUM_FLAG_OPERATORS(MDBX_dbi_state_t)
+#endif
+
+/** \brief Retrieve the DB flags and status for a database handle.
+ * \ingroup c_statinfo
+ *
+ * \param [in] txn     A transaction handle returned by \ref mdbx_txn_begin().
+ * \param [in] dbi     A database handle returned by \ref mdbx_dbi_open().
+ * \param [out] flags  Address where the flags will be returned.
+ * \param [out] state  Address where the state will be returned.
+ *
+ * \returns A non-zero error value on failure and 0 on success. */
+LIBMDBX_API int mdbx_dbi_flags_ex(MDBX_txn *txn, MDBX_dbi dbi, unsigned *flags,
+                                  unsigned *state);
+/** \brief The shortcut to calling \ref mdbx_dbi_flags_ex() with `state=NULL`
+ * for discarding its result. \ingroup c_statinfo */
+LIBMDBX_INLINE_API(int, mdbx_dbi_flags,
+                   (MDBX_txn * txn, MDBX_dbi dbi, unsigned *flags)) {
+  unsigned state;
+  return mdbx_dbi_flags_ex(txn, dbi, flags, &state);
+}
+
+/** \brief Close a database handle. Normally unnecessary.
+ * \ingroup c_dbi
+ *
+ * Closing a database handle is not necessary, but lets \ref mdbx_dbi_open()
+ * reuse the handle value. Usually it's better to set a bigger
+ * \ref mdbx_env_set_maxdbs(), unless that value would be large.
+ *
+ * \note Use with care.
+ * This call is synchronized via mutex with \ref mdbx_dbi_close(), but NOT
+ * with other transactions running by other threads. The "next" version of
+ * libmdbx (\ref MithrilDB) will solve this issue.
+ *
+ * Handles should only be closed if no other threads are going to reference
+ * the database handle or one of its cursors any further. Do not close a
+ * handle if an existing transaction has modified its database. Doing so can
+ * cause misbehavior from database corruption to errors like
+ * \ref MDBX_BAD_DBI (since the DB name is gone).
+ *
+ * \param [in] env  An environment handle returned by \ref mdbx_env_create().
+ * \param [in] dbi  A database handle returned by \ref mdbx_dbi_open().
+ *
+ * \returns A non-zero error value on failure and 0 on success. */
+LIBMDBX_API int mdbx_dbi_close(MDBX_env *env, MDBX_dbi dbi);
+
+/** \brief Empty or delete and close a database.
+ * \ingroup c_crud
+ *
+ * \see mdbx_dbi_close() \see mdbx_dbi_open()
+ *
+ * \param [in] txn  A transaction handle returned by \ref mdbx_txn_begin().
+ * \param [in] dbi  A database handle returned by \ref mdbx_dbi_open().
+ * \param [in] del  `false` to empty the DB, `true` to delete it
+ *                  from the environment and close the DB handle.
+ *
+ * \returns A non-zero error value on failure and 0 on success. */
+LIBMDBX_API int mdbx_drop(MDBX_txn *txn, MDBX_dbi dbi, bool del);
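+
+/* A minimal sketch contrasting emptying a database with deleting it; `dbi` is
+ * assumed to be a handle opened by mdbx_dbi_open() in the write transaction
+ * `txn`:
+ *
+ *   mdbx_drop(txn, dbi, false); // delete all key/value pairs, keep the DB
+ *   // or:
+ *   mdbx_drop(txn, dbi, true);  // delete the DB itself and close `dbi`
+ *   // either becomes visible to others only once `txn` is committed
+ */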
+
+/** \brief Get items from a database.
+ * \ingroup c_crud
+ *
+ * This function retrieves key/data pairs from the database. The address
+ * and length of the data associated with the specified key are returned
+ * in the structure to which data refers.
+ * If the database supports duplicate keys (\ref MDBX_DUPSORT), then the
+ * first data item for the key will be returned. Retrieval of other
+ * items requires the use of \ref mdbx_cursor_get().
+ *
+ * \note The memory pointed to by the returned values is owned by the
+ * database. The caller need not dispose of the memory, and may not
+ * modify it in any way. For values returned in a read-only transaction
+ * any modification attempts will cause a `SIGSEGV`.
+ *
+ * \note Values returned from the database are valid only until a
+ * subsequent update operation, or the end of the transaction.
+ *
+ * \param [in] txn       A transaction handle returned by
+ *                       \ref mdbx_txn_begin().
+ * \param [in] dbi       A database handle returned by \ref mdbx_dbi_open().
+ * \param [in] key       The key to search for in the database.
+ * \param [in,out] data  The data corresponding to the key.
+ *
+ * \returns A non-zero error value on failure and 0 on success,
+ *          some possible errors are:
+ * \retval MDBX_THREAD_MISMATCH  The given transaction is not owned
+ *                               by the current thread.
+ * \retval MDBX_NOTFOUND  The key was not in the database.
+ * \retval MDBX_EINVAL    An invalid parameter was specified. */
+LIBMDBX_API int mdbx_get(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key,
+                         MDBX_val *data);
+
+/** \brief Get items from a database
+ * and optionally the number of data items for a given key.
+ *
+ * \ingroup c_crud
+ *
+ * Briefly this function does the same as \ref mdbx_get(), with a few
+ * differences:
+ *  1. If values_count is NOT NULL, then it returns the count
+ *     of multi-values/duplicates for a given key.
+ *  2. It updates BOTH the key and the data to point to the actual key-value
+ *     pair inside the database.
+ *
+ * \param [in] txn           A transaction handle returned
+ *                           by \ref mdbx_txn_begin().
+ * \param [in] dbi           A database handle returned by
+ *                           \ref mdbx_dbi_open().
+ * \param [in,out] key       The key to search for in the database.
+ * \param [in,out] data      The data corresponding to the key.
+ * \param [out] values_count The optional address to return the number of
+ *                           values associated with the given key:
+ *                            = 0 - in case of the \ref MDBX_NOTFOUND error;
+ *                            = 1 - exactly for databases
+ *                                  WITHOUT \ref MDBX_DUPSORT;
+ *                            >= 1 for databases WITH \ref MDBX_DUPSORT.
+ *
+ * \returns A non-zero error value on failure and 0 on success,
+ *          some possible errors are:
+ * \retval MDBX_THREAD_MISMATCH  The given transaction is not owned
+ *                               by the current thread.
+ * \retval MDBX_NOTFOUND  The key was not in the database.
+ * \retval MDBX_EINVAL    An invalid parameter was specified. */
+LIBMDBX_API int mdbx_get_ex(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key,
+                            MDBX_val *data, size_t *values_count);
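+
+/* A minimal lookup sketch, assuming `dbi` stores string keys (the key "alice"
+ * is illustrative only):
+ *
+ *   MDBX_val key = { "alice", 5 };
+ *   MDBX_val data;
+ *   int rc = mdbx_get(txn, dbi, &key, &data);
+ *   if (rc == MDBX_SUCCESS) {
+ *     // data.iov_base/data.iov_len point into the DB; read-only, and valid
+ *     // only until the next update or the end of the transaction
+ *   } else if (rc == MDBX_NOTFOUND) {
+ *     // no such key
+ *   }
+ */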
+
+/** \brief Get equal or great item from a database.
+ * \ingroup c_crud
+ *
+ * Briefly this function does the same as \ref mdbx_get(), with a few
+ * differences:
+ *  1. It returns an equal or greater (according to the comparison function)
+ *     key-value pair, not only an exact match for the key.
+ *  2. On success it returns \ref MDBX_SUCCESS if the key was found exactly,
+ *     and \ref MDBX_RESULT_TRUE otherwise. Moreover, for databases with the
+ *     \ref MDBX_DUPSORT flag the data argument will also be used to match
+ *     over the multi-values/duplicates, and \ref MDBX_SUCCESS will be
+ *     returned only when BOTH the key and the data match exactly.
+ *  3. It updates BOTH the key and the data to point to the actual key-value
+ *     pair inside the database.
+ *
+ * \param [in] txn       A transaction handle returned
+ *                       by \ref mdbx_txn_begin().
+ * \param [in] dbi       A database handle returned by \ref mdbx_dbi_open().
+ * \param [in,out] key   The key to search for in the database.
+ * \param [in,out] data  The data corresponding to the key.
+ *
+ * \returns A non-zero error value on failure and \ref MDBX_RESULT_FALSE
+ *          or \ref MDBX_RESULT_TRUE on success (as described above).
+ *          Some possible errors are:
+ * \retval MDBX_THREAD_MISMATCH  The given transaction is not owned
+ *                               by the current thread.
+ * \retval MDBX_NOTFOUND  The key was not in the database.
+ * \retval MDBX_EINVAL    An invalid parameter was specified. */
+LIBMDBX_API int mdbx_get_equal_or_great(MDBX_txn *txn, MDBX_dbi dbi,
+                                        MDBX_val *key, MDBX_val *data);
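+
+/* A minimal range-seek sketch: find the first pair at or above a given key;
+ * `shifted` reuses the integer-key form from the value2key sketch above:
+ *
+ *   uint64_t shifted = mdbx_key_from_int64(1000);
+ *   MDBX_val key = { &shifted, sizeof(shifted) }, data;
+ *   int rc = mdbx_get_equal_or_great(txn, dbi, &key, &data);
+ *   if (rc == MDBX_SUCCESS) {
+ *     // exact match: key/data point at the pair for 1000
+ *   } else if (rc == MDBX_RESULT_TRUE) {
+ *     // key/data now point at the next greater pair
+ *   } else if (rc == MDBX_NOTFOUND) {
+ *     // no pair at or above the given key
+ *   }
+ */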
+ *      This flag may only be specified if the database was opened with
+ *      \ref MDBX_DUPFIXED. In combination with \ref MDBX_ALLDUPS,
+ *      all multi-values will be replaced.
+ *      The data argument must be an array of two \ref MDBX_val. The `iov_len`
+ *      of the first \ref MDBX_val must be the size of a single data element.
+ *      The `iov_base` of the first \ref MDBX_val must point to the beginning
+ *      of the array of contiguous data elements which must be properly
+ *      aligned in case of a database with the \ref MDBX_INTEGERDUP flag.
+ *      The `iov_len` of the second \ref MDBX_val must be the count of data
+ *      elements to store. On return this field will be set to the count of
+ *      elements actually written. The `iov_base` of the second \ref MDBX_val
+ *      is unused.
+ *
+ * \see \ref c_crud_hints "Quick reference for Insert/Update/Delete operations"
+ *
+ * \returns A non-zero error value on failure and 0 on success,
+ *          some possible errors are:
+ * \retval MDBX_THREAD_MISMATCH  Given transaction is not owned
+ *                               by current thread.
+ * \retval MDBX_KEYEXIST  The key/value pair already exists in the database.
+ * \retval MDBX_MAP_FULL  The database is full, see \ref mdbx_env_set_mapsize().
+ * \retval MDBX_TXN_FULL  The transaction has too many dirty pages.
+ * \retval MDBX_EACCES    An attempt was made to write
+ *                        in a read-only transaction.
+ * \retval MDBX_EINVAL    An invalid parameter was specified. */
+LIBMDBX_API int mdbx_put(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key,
+                         MDBX_val *data, MDBX_put_flags_t flags);
+
+/** \brief Replace items in a database.
+ * \ingroup c_crud
+ *
+ * This function allows updating or deleting an existing value while
+ * simultaneously retrieving the previous value. If the new_data argument
+ * is NULL, a removal is performed, otherwise an update/insert.
+ *
+ * The current value may be in an already changed (aka dirty) page. In this
+ * case, the page will be overwritten during the update, and the old value will
+ * be lost. Therefore, an additional buffer must be passed via the old_data
+ * argument initially, for the old value to be copied into. If the buffer
+ * passed in is too small, the function will return \ref MDBX_RESULT_TRUE,
+ * setting the iov_len field pointed to by the old_data argument to the
+ * appropriate value, without performing any changes.
+ *
+ * For databases with non-unique keys (i.e. with the \ref MDBX_DUPSORT flag),
+ * another use case is also possible, where the old_data argument selects a
+ * specific item among the multi-values/duplicates with the same key for
+ * deletion or update. To select this scenario, the flags should simultaneously
+ * specify \ref MDBX_CURRENT and \ref MDBX_NOOVERWRITE. This combination is
+ * chosen because it otherwise makes no sense, and thus unambiguously
+ * identifies a request for such a scenario.
+ *
+ * \param [in] txn       A transaction handle returned
+ *                       by \ref mdbx_txn_begin().
+ * \param [in] dbi       A database handle returned by \ref mdbx_dbi_open().
+ * \param [in] key       The key to store in the database.
+ * \param [in] new_data  The data to store; if NULL then deletion will
+ *                       be performed.
+ * \param [in,out] old_data  The buffer to retrieve the previous value,
+ *                       as described above.
+ * \param [in] flags     Special options for this operation.
+ *                       This parameter must be set to 0 or by bitwise
+ *                       OR'ing together one or more of the values
+ *                       described in the \ref mdbx_put() description above,
+ *                       and additionally the
+ *                       (\ref MDBX_CURRENT | \ref MDBX_NOOVERWRITE)
+ *                       combination for selecting a particular item from
+ *                       the multi-values/duplicates.
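+ *
+ * An editor's sketch of a fetch-and-update (`txn` and `dbi` are assumed
+ * valid; the key and values are illustrative; not part of the upstream text):
+ * \code
+ * char buf[64];
+ * MDBX_val key = {.iov_base = (void *)"counter", .iov_len = 7};
+ * MDBX_val new_data = {.iov_base = (void *)"new-value", .iov_len = 9};
+ * MDBX_val old_data = {.iov_base = buf, .iov_len = sizeof(buf)};
+ * int rc = mdbx_replace(txn, dbi, &key, &new_data, &old_data, 0);
+ * if (rc == MDBX_RESULT_TRUE) {
+ *   // the buffer was too small: old_data.iov_len holds the required size
+ * }
+ * \endcode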
+ *
+ * \see \ref c_crud_hints "Quick reference for Insert/Update/Delete operations"
+ *
+ * \returns A non-zero error value on failure and 0 on success. */
+LIBMDBX_API int mdbx_replace(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key,
+                             MDBX_val *new_data, MDBX_val *old_data,
+                             MDBX_put_flags_t flags);
+
+typedef int (*MDBX_preserve_func)(void *context, MDBX_val *target,
+                                  const void *src, size_t bytes);
+LIBMDBX_API int mdbx_replace_ex(MDBX_txn *txn, MDBX_dbi dbi,
+                                const MDBX_val *key, MDBX_val *new_data,
+                                MDBX_val *old_data, MDBX_put_flags_t flags,
+                                MDBX_preserve_func preserver,
+                                void *preserver_context);
+
+/** \brief Delete items from a database.
+ * \ingroup c_crud
+ *
+ * This function removes key/data pairs from the database.
+ *
+ * \note The data parameter is NOT ignored, regardless of whether the database
+ * supports sorted duplicate data items or not. If the data parameter
+ * is non-NULL only the matching data item will be deleted. Otherwise, if the
+ * data parameter is NULL, any/all value(s) for the specified key will be
+ * deleted.
+ *
+ * This function will return \ref MDBX_NOTFOUND if the specified key/data
+ * pair is not in the database.
+ *
+ * \see \ref c_crud_hints "Quick reference for Insert/Update/Delete operations"
+ *
+ * \param [in] txn   A transaction handle returned by \ref mdbx_txn_begin().
+ * \param [in] dbi   A database handle returned by \ref mdbx_dbi_open().
+ * \param [in] key   The key to delete from the database.
+ * \param [in] data  The data to delete.
+ *
+ * \returns A non-zero error value on failure and 0 on success,
+ *          some possible errors are:
+ * \retval MDBX_EACCES  An attempt was made to write
+ *                      in a read-only transaction.
+ * \retval MDBX_EINVAL  An invalid parameter was specified. */
+LIBMDBX_API int mdbx_del(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key,
+                         const MDBX_val *data);
+
+/** \brief Create a cursor handle without binding it to a transaction or
+ * DBI handle.
+ * \ingroup c_cursors
+ *
+ * An operational cursor is associated with a specific transaction and
+ * database. A cursor cannot be used when its database handle is closed, nor
+ * when its transaction has ended, except with \ref mdbx_cursor_bind() and
+ * \ref mdbx_cursor_renew().
+ * It can also be discarded with \ref mdbx_cursor_close().
+ *
+ * A cursor must always be closed explicitly, before or after its transaction
+ * ends. It can be reused with \ref mdbx_cursor_bind()
+ * or \ref mdbx_cursor_renew() before finally closing it.
+ *
+ * \note In contrast to LMDB, MDBX allows any opened cursor to be reused, and
+ * requires it to be freed explicitly, regardless of whether it was opened in
+ * a read-only or write transaction. The reason is that this eliminates
+ * ambiguity, which helps to avoid errors such as use-after-free and
+ * double-free, i.e. memory corruption and segfaults.
+ *
+ * \param [in] context  A pointer to an application context to be associated
+ *                      with the created cursor, which can be retrieved by
+ *                      \ref mdbx_cursor_get_userctx() until the cursor
+ *                      is closed.
+ *
+ * \returns The created cursor handle, or NULL in case of out-of-memory. */
+LIBMDBX_API MDBX_cursor *mdbx_cursor_create(void *context);
+
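+/* Editor's sketch of attaching an application context to a cursor (the
+ * `struct app_state` type and `get_app_state()` helper are hypothetical;
+ * not part of the upstream header):
+ * \code
+ * struct app_state *state = get_app_state();  // hypothetical helper
+ * MDBX_cursor *cursor = mdbx_cursor_create(state);
+ * // ... later, the context can be replaced or queried:
+ * int rc = mdbx_cursor_set_userctx(cursor, state);
+ * struct app_state *again = mdbx_cursor_get_userctx(cursor);  // == state
+ * \endcode */
+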
+/** \brief Set application information associated with the \ref MDBX_cursor.
+ * \ingroup c_cursors
+ * \see mdbx_cursor_get_userctx()
+ *
+ * \param [in] cursor  A cursor handle returned by \ref mdbx_cursor_create()
+ *                     or \ref mdbx_cursor_open().
+ * \param [in] ctx     An arbitrary pointer for whatever the application needs.
+ *
+ * \returns A non-zero error value on failure and 0 on success. */
+LIBMDBX_API int mdbx_cursor_set_userctx(MDBX_cursor *cursor, void *ctx);
+
+/** \brief Get the application information associated with the MDBX_cursor.
+ * \ingroup c_cursors
+ * \see mdbx_cursor_set_userctx()
+ *
+ * \param [in] cursor  A cursor handle returned by \ref mdbx_cursor_create()
+ *                     or \ref mdbx_cursor_open().
+ * \returns The pointer which was passed via the `context` parameter
+ *          of `mdbx_cursor_create()` or set by \ref mdbx_cursor_set_userctx(),
+ *          or `NULL` if something is wrong. */
+MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API void *
+mdbx_cursor_get_userctx(const MDBX_cursor *cursor);
+
+/** \brief Bind a cursor to a specified transaction and DBI handle.
+ * \ingroup c_cursors
+ *
+ * Using `mdbx_cursor_bind()` is equivalent to calling
+ * \ref mdbx_cursor_renew(), but with an arbitrary DBI handle specified.
+ *
+ * An operational cursor is associated with a specific transaction and
+ * database. The cursor may be associated with a new transaction, referencing
+ * either a new or the same database handle as it was created with.
+ * This may be done whether the previous transaction is live or dead.
+ *
+ * \note In contrast to LMDB, MDBX allows any opened cursor to be reused, and
+ * requires it to be freed explicitly, regardless of whether it was opened in
+ * a read-only or write transaction. The reason is that this eliminates
+ * ambiguity, which helps to avoid errors such as use-after-free and
+ * double-free, i.e. memory corruption and segfaults.
+ *
+ * \param [in] txn      A transaction handle returned by \ref mdbx_txn_begin().
+ * \param [in] dbi      A database handle returned by \ref mdbx_dbi_open().
+ * \param [out] cursor  A cursor handle returned by \ref mdbx_cursor_create().
+ *
+ * \returns A non-zero error value on failure and 0 on success,
+ *          some possible errors are:
+ * \retval MDBX_THREAD_MISMATCH  Given transaction is not owned
+ *                               by current thread.
+ * \retval MDBX_EINVAL  An invalid parameter was specified. */
+LIBMDBX_API int mdbx_cursor_bind(MDBX_txn *txn, MDBX_cursor *cursor,
+                                 MDBX_dbi dbi);
+
+/** \brief Create a cursor handle for the specified transaction and DBI handle.
+ * \ingroup c_cursors
+ *
+ * Using `mdbx_cursor_open()` is equivalent to calling
+ * \ref mdbx_cursor_create() and then \ref mdbx_cursor_bind().
+ *
+ * An operational cursor is associated with a specific transaction and
+ * database. A cursor cannot be used when its database handle is closed, nor
+ * when its transaction has ended, except with \ref mdbx_cursor_bind() and
+ * \ref mdbx_cursor_renew().
+ * It can also be discarded with \ref mdbx_cursor_close().
+ *
+ * A cursor must always be closed explicitly, before or after its transaction
+ * ends. It can be reused with \ref mdbx_cursor_bind()
+ * or \ref mdbx_cursor_renew() before finally closing it.
+ *
+ * \note In contrast to LMDB, MDBX allows any opened cursor to be reused, and
+ * requires it to be freed explicitly, regardless of whether it was opened in
+ * a read-only or write transaction. The reason is that this eliminates
+ * ambiguity, which helps to avoid errors such as use-after-free and
+ * double-free, i.e. memory corruption and segfaults.
+ *
+ * \param [in] txn      A transaction handle returned by \ref mdbx_txn_begin().
+ * \param [in] dbi      A database handle returned by \ref mdbx_dbi_open().
+ * \param [out] cursor  Address where the new \ref MDBX_cursor handle will be
+ *                      stored.
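+ *
+ * An editor's sketch of the equivalence described above (error handling
+ * elided; `txn` and `dbi` are assumed valid; not part of the upstream text):
+ * \code
+ * MDBX_cursor *cursor;
+ * int rc = mdbx_cursor_open(txn, dbi, &cursor);
+ * // ...is roughly equivalent to:
+ * cursor = mdbx_cursor_create(NULL);
+ * rc = mdbx_cursor_bind(txn, cursor, dbi);
+ * // ... use the cursor, then release it explicitly:
+ * mdbx_cursor_close(cursor);
+ * \endcode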
+ *
+ * \returns A non-zero error value on failure and 0 on success,
+ *          some possible errors are:
+ * \retval MDBX_THREAD_MISMATCH  Given transaction is not owned
+ *                               by current thread.
+ * \retval MDBX_EINVAL  An invalid parameter was specified. */
+LIBMDBX_API int mdbx_cursor_open(MDBX_txn *txn, MDBX_dbi dbi,
+                                 MDBX_cursor **cursor);
+
+/** \brief Close a cursor handle.
+ * \ingroup c_cursors
+ *
+ * The cursor handle will be freed and must not be used again after this call,
+ * but its transaction may still be live.
+ *
+ * \note In contrast to LMDB, MDBX allows any opened cursor to be reused, and
+ * requires it to be freed explicitly, regardless of whether it was opened in
+ * a read-only or write transaction. The reason is that this eliminates
+ * ambiguity, which helps to avoid errors such as use-after-free and
+ * double-free, i.e. memory corruption and segfaults.
+ *
+ * \param [in] cursor  A cursor handle returned by \ref mdbx_cursor_open()
+ *                     or \ref mdbx_cursor_create(). */
+LIBMDBX_API void mdbx_cursor_close(MDBX_cursor *cursor);
+
+/** \brief Renew a cursor handle.
+ * \ingroup c_cursors
+ *
+ * An operational cursor is associated with a specific transaction and
+ * database. The cursor may be associated with a new transaction, referencing
+ * either a new or the same database handle as it was created with.
+ * This may be done whether the previous transaction is live or dead.
+ *
+ * Using `mdbx_cursor_renew()` is equivalent to calling
+ * \ref mdbx_cursor_bind() with the DBI handle that the cursor was
+ * previously used with.
+ *
+ * \note In contrast to LMDB, MDBX allows any cursor to be reused via
+ * \ref mdbx_cursor_renew(), to avoid unnecessary malloc/free overhead
+ * until it is freed by \ref mdbx_cursor_close().
+ *
+ * \param [in] txn     A transaction handle returned by \ref mdbx_txn_begin().
+ * \param [in] cursor  A cursor handle returned by \ref mdbx_cursor_open().
+ *
+ * \returns A non-zero error value on failure and 0 on success,
+ *          some possible errors are:
+ * \retval MDBX_THREAD_MISMATCH  Given transaction is not owned
+ *                               by current thread.
+ * \retval MDBX_EINVAL  An invalid parameter was specified. */
+LIBMDBX_API int mdbx_cursor_renew(MDBX_txn *txn, MDBX_cursor *cursor);
+
+/** \brief Return the cursor's transaction handle.
+ * \ingroup c_cursors
+ *
+ * \param [in] cursor  A cursor handle returned by \ref mdbx_cursor_open(). */
+MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API MDBX_txn *
+mdbx_cursor_txn(const MDBX_cursor *cursor);
+
+/** \brief Return the cursor's database handle.
+ * \ingroup c_cursors
+ *
+ * \param [in] cursor  A cursor handle returned by \ref mdbx_cursor_open(). */
+LIBMDBX_API MDBX_dbi mdbx_cursor_dbi(const MDBX_cursor *cursor);
+
+/** \brief Copy cursor position and state.
+ * \ingroup c_cursors
+ *
+ * \param [in] src       A source cursor handle returned
+ *                       by \ref mdbx_cursor_create() or \ref mdbx_cursor_open().
+ *
+ * \param [in,out] dest  A destination cursor handle returned
+ *                       by \ref mdbx_cursor_create() or \ref mdbx_cursor_open().
+ *
+ * \returns A non-zero error value on failure and 0 on success. */
+LIBMDBX_API int mdbx_cursor_copy(const MDBX_cursor *src, MDBX_cursor *dest);
+
+/** \brief Retrieve by cursor.
+ * \ingroup c_crud
+ *
+ * This function retrieves key/data pairs from the database. The address and
+ * length of the key are returned in the object to which key refers (except
+ * for the case of the \ref MDBX_SET option, in which the key object is
+ * unchanged), and the address and length of the data are returned in the
+ * object to which data refers.
+ * \see mdbx_get()
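+ *
+ * An editor's sketch of a full forward scan (`txn` and `dbi` are assumed
+ * valid; MDBX_FIRST/MDBX_NEXT are the usual \ref MDBX_cursor_op values;
+ * not part of the upstream text):
+ * \code
+ * MDBX_cursor *cursor;
+ * int rc = mdbx_cursor_open(txn, dbi, &cursor);
+ * MDBX_val key, data;
+ * for (rc = mdbx_cursor_get(cursor, &key, &data, MDBX_FIRST);
+ *      rc == MDBX_SUCCESS;
+ *      rc = mdbx_cursor_get(cursor, &key, &data, MDBX_NEXT)) {
+ *   // key.iov_base/iov_len and data.iov_base/iov_len are valid here
+ * }
+ * if (rc != MDBX_NOTFOUND) {
+ *   // a real error, not just the end of the iteration
+ * }
+ * mdbx_cursor_close(cursor);
+ * \endcode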
+ *
+ * \param [in] cursor     A cursor handle returned by \ref mdbx_cursor_open().
+ * \param [in,out] key    The key for a retrieved item.
+ * \param [in,out] data   The data of a retrieved item.
+ * \param [in] op         A cursor operation \ref MDBX_cursor_op.
+ *
+ * \returns A non-zero error value on failure and 0 on success,
+ *          some possible errors are:
+ * \retval MDBX_THREAD_MISMATCH  Given transaction is not owned
+ *                               by current thread.
+ * \retval MDBX_NOTFOUND  No matching key found.
+ * \retval MDBX_EINVAL    An invalid parameter was specified. */
+LIBMDBX_API int mdbx_cursor_get(MDBX_cursor *cursor, MDBX_val *key,
+                                MDBX_val *data, MDBX_cursor_op op);
+
+/** \brief Store by cursor.
+ * \ingroup c_crud
+ *
+ * This function stores key/data pairs into the database. The cursor is
+ * positioned at the new item, or on failure usually near it.
+ *
+ * \param [in] cursor     A cursor handle returned by \ref mdbx_cursor_open().
+ * \param [in] key        The key operated on.
+ * \param [in,out] data   The data operated on.
+ * \param [in] flags      Options for this operation. This parameter
+ *                        must be set to 0 or by bitwise OR'ing together
+ *                        one or more of the values described here:
+ *  - \ref MDBX_CURRENT
+ *      Replace the item at the current cursor position. The key parameter
+ *      must still be provided and must match the current position, otherwise
+ *      the function returns \ref MDBX_EKEYMISMATCH. In combination with
+ *      \ref MDBX_ALLDUPS, all multi-values will be replaced.
+ *
+ *      \note Unlike LMDB, MDBX allows you to change the size of the data and
+ *      automatically handles reordering for sorted duplicates
+ *      (see \ref MDBX_DUPSORT).
+ *
+ *  - \ref MDBX_NODUPDATA
+ *      Enter the new key-value pair only if it does not already appear in the
+ *      database. This flag may only be specified if the database was opened
+ *      with \ref MDBX_DUPSORT. The function will return \ref MDBX_KEYEXIST
+ *      if the key/data pair already appears in the database.
+ *
+ *  - \ref MDBX_NOOVERWRITE
+ *      Enter the new key/data pair only if the key does not already appear
+ *      in the database. The function will return \ref MDBX_KEYEXIST if the key
+ *      already appears in the database, even if the database supports
+ *      duplicates (\ref MDBX_DUPSORT).
+ *
+ *  - \ref MDBX_RESERVE
+ *      Reserve space for data of the given size, but don't copy the given
+ *      data. Instead, return a pointer to the reserved space, which the
+ *      caller can fill in later - before the next update operation or the
+ *      transaction ends. This saves an extra memcpy if the data is being
+ *      generated later. This flag must not be specified if the database
+ *      was opened with \ref MDBX_DUPSORT.
+ *
+ *  - \ref MDBX_APPEND
+ *      Append the given key/data pair to the end of the database. No key
+ *      comparisons are performed. This option allows fast bulk loading when
+ *      keys are already known to be in the correct order. Loading unsorted
+ *      keys with this flag will cause a \ref MDBX_KEYEXIST error.
+ *
+ *  - \ref MDBX_APPENDDUP
+ *      As above, but for sorted dup data.
+ *
+ *  - \ref MDBX_MULTIPLE
+ *      Store multiple contiguous data elements in a single request. This flag
+ *      may only be specified if the database was opened with
+ *      \ref MDBX_DUPFIXED. In combination with \ref MDBX_ALLDUPS,
+ *      all multi-values will be replaced.
+ *      The data argument must be an array of two \ref MDBX_val. The `iov_len`
+ *      of the first \ref MDBX_val must be the size of a single data element.
+ *      The `iov_base` of the first \ref MDBX_val must point to the beginning
+ *      of the array of contiguous data elements which must be properly
+ *      aligned in case of a database with the \ref MDBX_INTEGERDUP flag.
+ *      The `iov_len` of the second \ref MDBX_val must be the count of data
+ *      elements to store. On return this field will be set to the count of
+ *      elements actually written. The `iov_base` of the second \ref MDBX_val
+ *      is unused.
+ *
+ * \see \ref c_crud_hints "Quick reference for Insert/Update/Delete operations"
+ *
+ * \returns A non-zero error value on failure and 0 on success,
+ *          some possible errors are:
+ * \retval MDBX_THREAD_MISMATCH  Given transaction is not owned
+ *                               by current thread.
+ * \retval MDBX_EKEYMISMATCH  The given key value is mismatched with the
+ *                            current cursor position.
+ * \retval MDBX_MAP_FULL  The database is full,
+ *                        see \ref mdbx_env_set_mapsize().
+ * \retval MDBX_TXN_FULL  The transaction has too many dirty pages.
+ * \retval MDBX_EACCES    An attempt was made to write in a read-only
+ *                        transaction.
+ * \retval MDBX_EINVAL    An invalid parameter was specified. */
+LIBMDBX_API int mdbx_cursor_put(MDBX_cursor *cursor, const MDBX_val *key,
+                                MDBX_val *data, MDBX_put_flags_t flags);
+
+/** \brief Delete current key/data pair.
+ * \ingroup c_crud
+ *
+ * This function deletes the key/data pair to which the cursor refers. This
+ * does not invalidate the cursor, so operations such as \ref MDBX_NEXT can
+ * still be used on it. Both \ref MDBX_NEXT and \ref MDBX_GET_CURRENT will
+ * return the same record after this operation.
+ *
+ * \param [in] cursor  A cursor handle returned by mdbx_cursor_open().
+ * \param [in] flags   Options for this operation. This parameter must be set
+ *                     to one of the values described here.
+ *  - \ref MDBX_CURRENT
+ *      Delete only the single entry at the current cursor position.
+ *  - \ref MDBX_ALLDUPS
+ *    or \ref MDBX_NODUPDATA (supported for compatibility)
+ *      Delete all of the data items for the current key. This flag has effect
+ *      only for databases created with \ref MDBX_DUPSORT.
+ *
+ * \see \ref c_crud_hints "Quick reference for Insert/Update/Delete operations"
+ *
+ * \returns A non-zero error value on failure and 0 on success,
+ *          some possible errors are:
+ * \retval MDBX_THREAD_MISMATCH  Given transaction is not owned
+ *                               by current thread.
+ * \retval MDBX_MAP_FULL  The database is full,
+ *                        see \ref mdbx_env_set_mapsize().
+ * \retval MDBX_TXN_FULL  The transaction has too many dirty pages.
+ * \retval MDBX_EACCES    An attempt was made to write in a read-only
+ *                        transaction.
+ * \retval MDBX_EINVAL    An invalid parameter was specified. */
+LIBMDBX_API int mdbx_cursor_del(MDBX_cursor *cursor, MDBX_put_flags_t flags);
+
+/** \brief Return count of duplicates for current key.
+ * \ingroup c_crud
+ *
+ * This call is valid for all databases, but is reasonable only for those
+ * that support sorted duplicate data items, i.e. \ref MDBX_DUPSORT.
+ *
+ * \param [in] cursor   A cursor handle returned by \ref mdbx_cursor_open().
+ * \param [out] pcount  Address where the count will be stored.
+ *
+ * \returns A non-zero error value on failure and 0 on success,
+ *          some possible errors are:
+ * \retval MDBX_THREAD_MISMATCH  Given transaction is not owned
+ *                               by current thread.
+ * \retval MDBX_EINVAL  Cursor is not initialized, or an invalid parameter
+ *                      was specified.
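+ *
+ * An editor's sketch for a \ref MDBX_DUPSORT database, with the cursor
+ * already positioned on some key (not part of the upstream text):
+ * \code
+ * size_t values = 0;
+ * int rc = mdbx_cursor_count(cursor, &values);
+ * // on success `values` holds the number of data items for the current key
+ * \endcode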
+ */
+LIBMDBX_API int mdbx_cursor_count(const MDBX_cursor *cursor, size_t *pcount);
+
+/** \brief Determines whether the cursor points to a key-value pair or not,
+ * i.e. whether it was not positioned or points to the end of data.
+ * \ingroup c_cursors
+ *
+ * \param [in] cursor  A cursor handle returned by \ref mdbx_cursor_open().
+ *
+ * \returns A \ref MDBX_RESULT_TRUE or \ref MDBX_RESULT_FALSE value,
+ *          otherwise the error code:
+ * \retval MDBX_RESULT_TRUE   No more data available or cursor not positioned.
+ * \retval MDBX_RESULT_FALSE  Data is available.
+ * \retval Otherwise          The error code. */
+MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API int
+mdbx_cursor_eof(const MDBX_cursor *cursor);
+
+/** \brief Determines whether the cursor points to the first key-value pair
+ * or not.
+ * \ingroup c_cursors
+ *
+ * \param [in] cursor  A cursor handle returned by \ref mdbx_cursor_open().
+ *
+ * \returns A MDBX_RESULT_TRUE or MDBX_RESULT_FALSE value,
+ *          otherwise the error code:
+ * \retval MDBX_RESULT_TRUE   Cursor positioned on the first key-value pair.
+ * \retval MDBX_RESULT_FALSE  Cursor NOT positioned on the first key-value
+ *                            pair.
+ * \retval Otherwise          The error code. */
+MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API int
+mdbx_cursor_on_first(const MDBX_cursor *cursor);
+
+/** \brief Determines whether the cursor points to the last key-value pair
+ * or not.
+ * \ingroup c_cursors
+ *
+ * \param [in] cursor  A cursor handle returned by \ref mdbx_cursor_open().
+ *
+ * \returns A \ref MDBX_RESULT_TRUE or \ref MDBX_RESULT_FALSE value,
+ *          otherwise the error code:
+ * \retval MDBX_RESULT_TRUE   Cursor positioned on the last key-value pair.
+ * \retval MDBX_RESULT_FALSE  Cursor NOT positioned on the last key-value pair.
+ * \retval Otherwise          The error code. */
+MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API int
+mdbx_cursor_on_last(const MDBX_cursor *cursor);
+
+/** \addtogroup c_rqest
+ * \details \note The estimation result varies greatly depending on the filling
+ * of specific pages and the overall balance of the b-tree:
+ *
+ * 1. The number of items is estimated by analyzing the height and fullness of
+ * the b-tree. The accuracy of the result directly depends on the balance of
+ * the b-tree, which in turn is determined by the history of previous
+ * insert/delete operations and the nature of the data (i.e. variability of
+ * keys length and so on). Therefore, the accuracy of the estimation can vary
+ * greatly in a particular situation.
+ *
+ * 2. To understand the potential spread of results, you should consider the
+ * possible situations based on the general criteria for splitting and merging
+ * b-tree pages:
+ *  - a page is split into two when there is no space for added data;
+ *  - two pages merge if the result fits in half a page;
+ *  - thus, the b-tree can consist of an arbitrary combination of pages filled
+ *    anywhere from completely down to only 1/4. Therefore, in the worst case,
+ *    the result can diverge by a factor of 4 for each level of the b-tree,
+ *    excepting the first and the last.
+ *
+ * 3. In practice, the probability of extreme cases of the above situation is
+ * close to zero and in most cases the error does not exceed a few percent. On
+ * the other hand, it is only a probability, and one you should not
+ * overestimate. */
+
+/** \brief Estimates the distance between cursors as a number of elements.
+ * \ingroup c_rqest
+ *
+ * This function performs a rough estimate based only on b-tree pages that are
+ * common to both cursors' stacks. The results of such an estimation can be
+ * used to build and/or optimize query execution plans.
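+ *
+ * An editor's sketch (`first` and `last` are assumed to be cursors positioned
+ * on the bounds of a range within one transaction and database; not part of
+ * the upstream text):
+ * \code
+ * ptrdiff_t items = 0;
+ * int rc = mdbx_estimate_distance(first, last, &items);
+ * // on success `items` holds a rough estimate of the elements in between
+ * \endcode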
+ *
+ * Please see the notes on the accuracy of the result in the details
+ * of the \ref c_rqest section.
+ *
+ * Both cursors must be initialized for the same database and the same
+ * transaction.
+ *
+ * \param [in] first  The first cursor for estimation.
+ * \param [in] last   The second cursor for estimation.
+ * \param [out] distance_items  The pointer to store the estimated distance,
+ *                              i.e. `*distance_items = distance(first, last)`.
+ *
+ * \returns A non-zero error value on failure and 0 on success. */
+LIBMDBX_API int mdbx_estimate_distance(const MDBX_cursor *first,
+                                       const MDBX_cursor *last,
+                                       ptrdiff_t *distance_items);
+
+/** \brief Estimates the move distance.
+ * \ingroup c_rqest
+ *
+ * This function performs a rough estimate of the distance between the current
+ * cursor position and the next position after the specified move operation
+ * with the given key and data. The results of such an estimation can be used
+ * to build and/or optimize query execution plans. The current cursor position
+ * and state are preserved.
+ *
+ * Please see the notes on the accuracy of the result in the details
+ * of the \ref c_rqest section.
+ *
+ * \param [in] cursor    Cursor for estimation.
+ * \param [in,out] key   The key for a retrieved item.
+ * \param [in,out] data  The data of a retrieved item.
+ * \param [in] move_op   A cursor operation \ref MDBX_cursor_op.
+ * \param [out] distance_items  A pointer to store the estimated move distance
+ *                              as a number of elements.
+ *
+ * \returns A non-zero error value on failure and 0 on success. */
+LIBMDBX_API int mdbx_estimate_move(const MDBX_cursor *cursor, MDBX_val *key,
+                                   MDBX_val *data, MDBX_cursor_op move_op,
+                                   ptrdiff_t *distance_items);
+
+/** \brief Estimates the size of a range as a number of elements.
+ * \ingroup c_rqest
+ *
+ * The results of such an estimation can be used to build and/or optimize
+ * query execution plans.
+ *
+ * Please see the notes on the accuracy of the result in the details
+ * of the \ref c_rqest section.
+ *
+ * \param [in] txn         A transaction handle returned
+ *                         by \ref mdbx_txn_begin().
+ * \param [in] dbi         A database handle returned by \ref mdbx_dbi_open().
+ * \param [in] begin_key   The key of the range beginning,
+ *                         or NULL for explicit FIRST.
+ * \param [in] begin_data  Optional additional data for seeking among sorted
+ *                         duplicates.
+ *                         Only for \ref MDBX_DUPSORT, NULL otherwise.
+ * \param [in] end_key     The key of the range ending,
+ *                         or NULL for explicit LAST.
+ * \param [in] end_data    Optional additional data for seeking among sorted
+ *                         duplicates.
+ *                         Only for \ref MDBX_DUPSORT, NULL otherwise.
+ * \param [out] distance_items  A pointer to store the range estimation result.
+ *
+ * \returns A non-zero error value on failure and 0 on success. */
+LIBMDBX_API int mdbx_estimate_range(MDBX_txn *txn, MDBX_dbi dbi,
+                                    MDBX_val *begin_key, MDBX_val *begin_data,
+                                    MDBX_val *end_key, MDBX_val *end_data,
+                                    ptrdiff_t *distance_items);
+
+/** \brief The EPSILON value for mdbx_estimate_range()
+ * \ingroup c_rqest */
+#define MDBX_EPSILON ((MDBX_val *)((ptrdiff_t)-1))
+
+/** \brief Determines whether the given address is on a dirty database page of
+ * the transaction or not.
+ * \ingroup c_statinfo
+ *
+ * Ultimately, this allows avoiding copying data from non-dirty pages.
+ *
+ * "Dirty" pages are those that have already been changed during a write
+ * transaction. Accordingly, any further changes may result in such pages being
+ * overwritten. Therefore, all libmdbx functions that perform changes inside
+ * the database should NOT receive pointers to data in those pages as
+ * arguments. In turn, "not dirty" pages will be copied before modification.
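+ *
+ * An editor's sketch (`txn` is assumed to be a write transaction and `val`
+ * a value previously returned from a lookup within the same transaction;
+ * not part of the upstream text):
+ * \code
+ * int rc = mdbx_is_dirty(txn, val.iov_base);
+ * if (rc == MDBX_RESULT_TRUE) {
+ *   // the value may be overwritten by the next update: copy it first
+ * } else if (rc == MDBX_RESULT_FALSE) {
+ *   // stable until the end of the transaction
+ * }
+ * \endcode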
+ *
+ * In other words, data from dirty pages must either be copied before being
+ * passed as arguments for further processing or rejected at the argument
+ * validation stage. Thus, `mdbx_is_dirty()` allows you to get rid of
+ * unnecessary copying, and to perform a more complete check of the arguments.
+ *
+ * \note The address passed must point to the beginning of the data. This is
+ * the only way to ensure that the actual page header is physically located in
+ * the same memory page, including for multi-pages with long data.
+ *
+ * \note In rare cases the function may return a false positive answer
+ * (\ref MDBX_RESULT_TRUE when data is NOT on a dirty page), but never a false
+ * negative if the arguments are correct.
+ *
+ * \param [in] txn  A transaction handle returned by \ref mdbx_txn_begin().
+ * \param [in] ptr  The address of the data to check.
+ *
+ * \returns A MDBX_RESULT_TRUE or MDBX_RESULT_FALSE value,
+ *          otherwise the error code:
+ * \retval MDBX_RESULT_TRUE   The given address is on a dirty page.
+ * \retval MDBX_RESULT_FALSE  The given address is NOT on a dirty page.
+ * \retval Otherwise          The error code. */
+MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API int mdbx_is_dirty(const MDBX_txn *txn,
+                                                         const void *ptr);
+
+/** \brief Sequence generation for a database.
+ * \ingroup c_crud
+ *
+ * The function allows creating a linear sequence of unique positive integers
+ * for each database. The function can be called within a read transaction to
+ * retrieve the current sequence value, in which case the increment must be
+ * zero. Sequence changes become visible outside the current write transaction
+ * after it is committed, and are discarded on abort.
+ *
+ * \param [in] txn        A transaction handle returned
+ *                        by \ref mdbx_txn_begin().
+ * \param [in] dbi        A database handle returned by \ref mdbx_dbi_open().
+ * \param [out] result    The optional address where the value of the sequence
+ *                        before the change will be stored.
+ * \param [in] increment  Value to increase the sequence by,
+ *                        must be 0 for read-only transactions.
+ *
+ * \returns A non-zero error value on failure and 0 on success,
+ *          some possible errors are:
+ * \retval MDBX_RESULT_TRUE  Increasing the sequence has resulted in an
+ *                           overflow and therefore cannot be executed. */
+LIBMDBX_API int mdbx_dbi_sequence(MDBX_txn *txn, MDBX_dbi dbi, uint64_t *result,
+                                  uint64_t increment);
+
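+/* Editor's sketch for \ref mdbx_dbi_sequence(): reserve a block of three
+ * unique IDs inside a write transaction (`txn` and `dbi` assumed valid;
+ * not part of the upstream header):
+ * \code
+ * uint64_t first_id = 0;
+ * int rc = mdbx_dbi_sequence(txn, dbi, &first_id, 3);
+ * // on success the IDs first_id, first_id + 1 and first_id + 2 are reserved
+ * \endcode */
+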
+/** \brief Compare two keys according to a particular database.
+ * \ingroup c_crud
+ *
+ * This returns a comparison as if the two data items were keys in the
+ * specified database.
+ *
+ * \warning There is undefined behavior if one of the arguments is invalid.
+ *
+ * \param [in] txn  A transaction handle returned by \ref mdbx_txn_begin().
+ * \param [in] dbi  A database handle returned by \ref mdbx_dbi_open().
+ * \param [in] a    The first item to compare.
+ * \param [in] b    The second item to compare.
+ *
+ * \returns < 0 if a < b, 0 if a == b, > 0 if a > b */
+MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API int mdbx_cmp(const MDBX_txn *txn,
+                                                    MDBX_dbi dbi,
+                                                    const MDBX_val *a,
+                                                    const MDBX_val *b);
+
+/** \brief Returns the default internal key comparator for the given database
+ * flags.
+ * \ingroup c_extra */
+MDBX_NOTHROW_CONST_FUNCTION LIBMDBX_API MDBX_cmp_func *
+mdbx_get_keycmp(MDBX_db_flags_t flags);
+
+/** \brief Compare two data items according to a particular database.
+ * \ingroup c_crud
+ *
+ * This returns a comparison as if the two items were data items of the
+ * specified database.
+ *
+ * \warning There is undefined behavior if one of the arguments is invalid.
+ *
+ * \param [in] txn  A transaction handle returned by \ref mdbx_txn_begin().
+ * \param [in] dbi  A database handle returned by \ref mdbx_dbi_open().
+ * \param [in] a    The first item to compare.
+ * \param [in] b    The second item to compare.
+ *
+ * \returns < 0 if a < b, 0 if a == b, > 0 if a > b */
+MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API int mdbx_dcmp(const MDBX_txn *txn,
+                                                     MDBX_dbi dbi,
+                                                     const MDBX_val *a,
+                                                     const MDBX_val *b);
+
+/** \brief Returns the default internal data comparator for the given database
+ * flags.
+ * \ingroup c_extra */
+MDBX_NOTHROW_CONST_FUNCTION LIBMDBX_API MDBX_cmp_func *
+mdbx_get_datacmp(MDBX_db_flags_t flags);
+
+/** \brief A callback function used to enumerate the reader lock table.
+ * \ingroup c_statinfo
+ *
+ * \param [in] ctx     An arbitrary context pointer for the callback.
+ * \param [in] num     The serial number during enumeration,
+ *                     starting from 1.
+ * \param [in] slot    The reader lock table slot number.
+ * \param [in] txnid   The ID of the transaction being read,
+ *                     i.e. the MVCC-snapshot number.
+ * \param [in] lag     The lag from a recent MVCC-snapshot,
+ *                     i.e. the number of committed write transactions
+ *                     since the current read transaction started.
+ * \param [in] pid     The reader process ID.
+ * \param [in] thread  The reader thread ID.
+ * \param [in] bytes_used      The number of the last used page in the
+ *                             MVCC-snapshot being read, i.e. the database
+ *                             file can't be shrunk beyond this.
+ * \param [in] bytes_retained  The total size of the database pages that were
+ *                             retired by committed write transactions after
+ *                             the reader's MVCC-snapshot, i.e. the space
+ *                             which would be freed for reuse after the reader
+ *                             releases the MVCC-snapshot upon completing the
+ *                             read transaction.
+ *
+ * \returns < 0 on failure, >= 0 on success. \see mdbx_reader_list() */
+typedef int(MDBX_reader_list_func)(void *ctx, int num, int slot, mdbx_pid_t pid,
+                                   mdbx_tid_t thread, uint64_t txnid,
+                                   uint64_t lag, size_t bytes_used,
+                                   size_t bytes_retained) MDBX_CXX17_NOEXCEPT;
+
+/** \brief Enumerate the entries in the reader lock table.
+ *
+ * \ingroup c_statinfo
+ *
+ * \param [in] env   An environment handle returned by \ref mdbx_env_create().
+ * \param [in] func  A \ref MDBX_reader_list_func function.
+ * \param [in] ctx   An arbitrary context pointer for the enumeration
+ *                   function.
+ *
+ * \returns A non-zero error value on failure and 0 on success,
+ *          or \ref MDBX_RESULT_TRUE if the reader lock table is empty. */
+LIBMDBX_API int mdbx_reader_list(const MDBX_env *env,
+                                 MDBX_reader_list_func *func, void *ctx);
+
+/** \brief Check for stale entries in the reader lock table.
+ * \ingroup c_extra
+ *
+ * \param [in] env    An environment handle returned by \ref mdbx_env_create().
+ * \param [out] dead  Number of stale slots that were cleared.
+ *
+ * \returns A non-zero error value on failure and 0 on success,
+ *          or \ref MDBX_RESULT_TRUE if dead reader(s) were found
+ *          or the mutex was recovered. */
+LIBMDBX_API int mdbx_reader_check(MDBX_env *env, int *dead);
+
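+/* Editor's sketch of a \ref MDBX_reader_list_func implementation that just
+ * logs each reader (all names besides the libmdbx API are illustrative;
+ * requires <stdio.h>; not part of the upstream header):
+ * \code
+ * static int log_reader(void *ctx, int num, int slot, mdbx_pid_t pid,
+ *                       mdbx_tid_t thread, uint64_t txnid, uint64_t lag,
+ *                       size_t bytes_used, size_t bytes_retained) {
+ *   (void)ctx; (void)slot; (void)thread;
+ *   (void)bytes_used; (void)bytes_retained;
+ *   printf("#%d: pid %ld, txn %llu, lag %llu\n", num, (long)pid,
+ *          (unsigned long long)txnid, (unsigned long long)lag);
+ *   return 0;  // non-negative to continue the enumeration
+ * }
+ * // ... mdbx_reader_list(env, log_reader, NULL);
+ * \endcode */
+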
+/** \brief Returns the reading lag for the given transaction.
+ * \ingroup c_statinfo
+ *
+ * Returns information to estimate how much the given read-only
+ * transaction is lagging relative to the actual head.
+ * \deprecated Please use \ref mdbx_txn_info() instead.
+ *
+ * \param [in] txn       A transaction handle returned by \ref mdbx_txn_begin().
+ * \param [out] percent  Percentage of page allocation in the database.
+ *
+ * \returns The number of transactions committed after the given one was
+ *          started for reading, or a negative value on failure. */
+MDBX_DEPRECATED LIBMDBX_API int mdbx_txn_straggler(const MDBX_txn *txn,
+                                                   int *percent);
+
+/** \brief Registers the current thread as a reader for the environment.
+ * \ingroup c_extra
+ *
+ * To perform read operations without blocking, a reader slot must be assigned
+ * for each thread. However, this assignment requires a short-term lock
+ * acquisition, which is performed automatically. This function allows you to
+ * assign the reader slot in advance and thus avoid taking the lock when the
+ * first read transaction starts from the current thread.
+ * \see mdbx_thread_unregister()
+ *
+ * \note Threads are registered automatically the first time a read transaction
+ *       starts. Therefore, there is no need to use this function, except in
+ *       special cases.
+ *
+ * \param [in] env  An environment handle returned by \ref mdbx_env_create().
+ *
+ * \returns A non-zero error value on failure and 0 on success,
+ *          or \ref MDBX_RESULT_TRUE if the thread is already registered. */
+LIBMDBX_API int mdbx_thread_register(const MDBX_env *env);
+
+/** \brief Unregisters the current thread as a reader for the environment.
+ * \ingroup c_extra
+ *
+ * To perform read operations without blocking, a reader slot must be assigned
+ * for each thread. However, the assigned reader slot will remain occupied until
+ * the thread ends or the environment closes. This function allows you to
+ * explicitly release the assigned reader slot.
+ * \see mdbx_thread_register()
+ *
+ * \param [in] env  An environment handle returned by \ref mdbx_env_create().
+ *
+ * \returns A non-zero error value on failure and 0 on success, or
+ *          \ref MDBX_RESULT_TRUE if the thread is not registered or is
+ *          already unregistered. */
+LIBMDBX_API int mdbx_thread_unregister(const MDBX_env *env);
+
+/** \brief A Handle-Slow-Readers callback function to resolve database
+ * full/overflow issues caused by reader(s) which prevent the old data from
+ * being recycled.
+ * \ingroup c_err
+ *
+ * Read transactions prevent reuse of pages freed by newer write transactions,
+ * thus the database can grow quickly. This callback will be called when there
+ * is not enough space in the database (i.e. before increasing the database
+ * size or before a \ref MDBX_MAP_FULL error) and thus can be used to resolve
+ * issues with "long-lived" read transactions.
+ * \see long-lived-read
+ *
+ * Using this callback you can choose how to resolve the situation:
+ *  - abort the write transaction with an error;
+ *  - wait for the read transaction(s) to complete;
+ *  - notify a thread performing a long-lived read transaction
+ *    and wait for an effect;
+ *  - kill the thread or the whole process that performs the long-lived read
+ *    transaction;
+ *
+ * Depending on the arguments and needs, your implementation may wait,
+ * terminate a process or thread that is performing a long read, or perform
+ * some other action. In doing so it is important that the returned code always
+ * corresponds to the performed action.
+ *
+ * \param [in] env      An environment handle returned
+ *                      by \ref mdbx_env_create().
+ * \param [in] txn      The current write transaction which is internally at
+ *                      the \ref MDBX_MAP_FULL condition.
+ * \param [in] pid      A pid of the reader process.
+ * \param [in] tid      A thread_id of the reader thread.
+ * \param [in] laggard  The oldest read transaction number on which the writer
+ *                      is stalled.
+ * \param [in] gap      A lag from the last committed txn.
+ * \param [in] space    The space that would actually become available for
+ *                      reuse after this reader finishes. The callback function
+ *                      can take this value into account to evaluate the impact
+ *                      that a long-running transaction has.
+ * \param [in] retry    A retry number starting from 0.
+ *                      If the callback has returned 0 at least once, then at
+ *                      the end of the current handling loop the callback
+ *                      function will be called additionally with a negative
+ *                      value to notify about the end of the loop. The callback
+ *                      function can use this value to implement timeout logic
+ *                      while waiting for readers.
+ *
+ * \returns The RETURN CODE determines the further actions of libmdbx and must
+ *          match the action which was actually executed by the callback:
+ *
+ * \retval -2 or less  An error condition, and the reader was not killed.
+ *
+ * \retval -1          The callback was unable to solve the problem and
+ *                     agreed to a \ref MDBX_MAP_FULL error;
+ *                     libmdbx should increase the database size or
+ *                     return the \ref MDBX_MAP_FULL error.
+ *
+ * \retval 0 (zero)    The callback solved the problem or just waited for
+ *                     a while; libmdbx should rescan the reader lock table and
+ *                     retry. This also includes the situation when the
+ *                     corresponding transaction terminated in a normal way via
+ *                     \ref mdbx_txn_abort() or \ref mdbx_txn_reset(),
+ *                     and may be restarted, i.e. the reader slot doesn't need
+ *                     to be cleaned.
+ *
+ * \retval 1           The transaction was aborted asynchronously and the
+ *                     reader slot should be cleared immediately, i.e. the read
+ *                     transaction will not continue, but \ref mdbx_txn_abort()
+ *                     or \ref mdbx_txn_reset() will be called later.
+ *
+ * \retval 2 or greater  The reader process was terminated or killed, and
+ *                       libmdbx should entirely reset the reader registration.
+ *
+ * \see mdbx_env_set_hsr() \see mdbx_env_get_hsr()
+ */
+typedef int(MDBX_hsr_func)(const MDBX_env *env, const MDBX_txn *txn,
+                           mdbx_pid_t pid, mdbx_tid_t tid, uint64_t laggard,
+                           unsigned gap, size_t space,
+                           int retry) MDBX_CXX17_NOEXCEPT;
+
+/** \brief Sets a Handle-Slow-Readers callback to resolve database
+ * full/overflow issues caused by reader(s) which prevent the old data from
+ * being recycled.
+ * \ingroup c_err
+ *
+ * The callback will only be triggered when the database is full due to
+ * reader(s) which prevent the old data from being recycled.
+ *
+ * \see mdbx_env_get_hsr()
+ * \see long-lived-read
+ *
+ * \param [in] env           An environment handle returned
+ *                           by \ref mdbx_env_create().
+ * \param [in] hsr_callback  A \ref MDBX_hsr_func function
+ *                           or NULL to disable.
+ *
+ * \returns A non-zero error value on failure and 0 on success. */
+LIBMDBX_API int mdbx_env_set_hsr(MDBX_env *env, MDBX_hsr_func *hsr_callback);
+
+/** \brief Gets the current Handle-Slow-Readers callback used to resolve
+ * database full/overflow issues caused by reader(s) which prevent the old
+ * data from being recycled.
+ * \see mdbx_env_set_hsr()
+ *
+ * \param [in] env  An environment handle returned by \ref mdbx_env_create().
+ *
+ * \returns A \ref MDBX_hsr_func function, or NULL if disabled
+ *          or if something is wrong. */
+MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API MDBX_hsr_func *
+mdbx_env_get_hsr(const MDBX_env *env);
+
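+/* Editor's sketch of a trivial Handle-Slow-Readers policy that simply gives
+ * up and lets the write transaction fail with \ref MDBX_MAP_FULL (the
+ * function name is illustrative; not part of the upstream header):
+ * \code
+ * static int give_up_hsr(const MDBX_env *env, const MDBX_txn *txn,
+ *                        mdbx_pid_t pid, mdbx_tid_t tid, uint64_t laggard,
+ *                        unsigned gap, size_t space, int retry) {
+ *   (void)env; (void)txn; (void)pid; (void)tid;
+ *   (void)laggard; (void)gap; (void)space; (void)retry;
+ *   return -1;  // "unable to solve", per the return-code contract above
+ * }
+ * // ... mdbx_env_set_hsr(env, give_up_hsr);
+ * \endcode */
+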
+/** \defgroup btree_traversal B-tree Traversal
+ * This is an internal API for the mdbx_chk tool. You should avoid using it,
+ * except in some extreme special cases.
+ * \ingroup c_extra
+ * @{ */
+
+/** \brief Page types for traversing the b-tree.
+ * \see mdbx_env_pgwalk() \see MDBX_pgvisitor_func */
+enum MDBX_page_type_t {
+  MDBX_page_broken,
+  MDBX_page_meta,
+  MDBX_page_large,
+  MDBX_page_branch,
+  MDBX_page_leaf,
+  MDBX_page_dupfixed_leaf,
+  MDBX_subpage_leaf,
+  MDBX_subpage_dupfixed_leaf,
+  MDBX_subpage_broken,
+};
+#ifndef __cplusplus
+typedef enum MDBX_page_type_t MDBX_page_type_t;
+#endif
+
+/** \brief Pseudo-name for MainDB */
+#define MDBX_PGWALK_MAIN ((const char *)((ptrdiff_t)0))
+/** \brief Pseudo-name for GarbageCollectorDB */
+#define MDBX_PGWALK_GC ((const char *)((ptrdiff_t)-1))
+/** \brief Pseudo-name for MetaPages */
+#define MDBX_PGWALK_META ((const char *)((ptrdiff_t)-2))
+
+/** \brief Callback function for traversing the b-tree.
+ * \see mdbx_env_pgwalk() */
+typedef int MDBX_pgvisitor_func(
+    const uint64_t pgno, const unsigned number, void *const ctx, const int deep,
+    const char *const dbi, const size_t page_size, const MDBX_page_type_t type,
+    const MDBX_error_t err, const size_t nentries, const size_t payload_bytes,
+    const size_t header_bytes, const size_t unused_bytes) MDBX_CXX17_NOEXCEPT;
+
+/** \brief B-tree traversal function. */
+LIBMDBX_API int mdbx_env_pgwalk(MDBX_txn *txn, MDBX_pgvisitor_func *visitor,
+                                void *ctx, bool dont_check_keys_ordering);
+
+/** \brief Open an environment instance using a specific meta-page
+ * for checking and recovery.
+ *
+ * This function is mostly an internal API for the `mdbx_chk` utility and is
+ * subject to change at any time. Do not use this function to avoid shooting
+ * yourself in the foot. */
+LIBMDBX_API int mdbx_env_open_for_recovery(MDBX_env *env, const char *pathname,
+                                           unsigned target_meta,
+                                           bool writeable);
+
+/** \brief Turn the database to the specified meta-page.
+ *
+ * This function is mostly an internal API for the `mdbx_chk` utility and is
+ * subject to change at any time. Do not use this function to avoid shooting
+ * yourself in the foot. */
+LIBMDBX_API int mdbx_env_turn_for_recovery(MDBX_env *env, unsigned target_meta);
+
+/** @} B-tree Traversal */
+
+/**** Attribute support functions for Nexenta (scheduled for removal)
+ *****************************************************************/
+#if defined(MDBX_NEXENTA_ATTRS) || defined(DOXYGEN)
+/** \defgroup nexenta Attribute support functions for Nexenta
+ * \ingroup c_crud
+ * @{ */
+typedef uint_fast64_t mdbx_attr_t;
+
+/** Store by cursor with attribute.
+ *
+ * This function stores key/data pairs into the database. The cursor is
+ * positioned at the new item, or on failure usually near it.
+ *
+ * \note Internally based on the \ref MDBX_RESERVE feature,
+ *       therefore it doesn't support \ref MDBX_DUPSORT.
+ *
+ * \param [in] cursor  A cursor handle returned by \ref mdbx_cursor_open()
+ * \param [in] key     The key operated on.
+ * \param [in] data    The data operated on.
+ * \param [in] attr    The attribute.
+ * \param [in] flags   Options for this operation. This parameter must be set
+ *                     to 0 or one of the values described here:
+ *  - \ref MDBX_CURRENT
+ *      Replace the item at the current cursor position. The key parameter
+ *      must still be provided and must match the current position, otherwise
+ *      the function returns \ref MDBX_EKEYMISMATCH.
+ *
+ *  - \ref MDBX_APPEND
+ *      Append the given key/data pair to the end of the database. No key
+ *      comparisons are performed. This option allows fast bulk loading when
+ *      keys are already known to be in the correct order. Loading unsorted
+ *      keys with this flag will cause a \ref MDBX_KEYEXIST error.
+ *
+ * \see \ref c_crud_hints "Quick reference for Insert/Update/Delete operations"
+ *
+ * \returns A non-zero error value on failure and 0 on success,
+ *          some possible errors are:
+ * \retval MDBX_EKEYMISMATCH
+ * \retval MDBX_MAP_FULL  The database is full, see \ref mdbx_env_set_mapsize().
+ * \retval MDBX_TXN_FULL  The transaction has too many dirty pages.
+ * \retval MDBX_EACCES    An attempt was made to write in a read-only
+ *                        transaction.
+ * \retval MDBX_EINVAL    An invalid parameter was specified. */
+LIBMDBX_API int mdbx_cursor_put_attr(MDBX_cursor *cursor, MDBX_val *key,
+                                     MDBX_val *data, mdbx_attr_t attr,
+                                     MDBX_put_flags_t flags);
+
+/** Store items and attributes into a database.
+ *
+ * This function stores key/data pairs in the database. The default behavior
+ * is to enter the new key/data pair, replacing any previously existing key
+ * if duplicates are disallowed.
+ *
+ * \note Internally based on the \ref MDBX_RESERVE feature,
+ *       therefore it doesn't support \ref MDBX_DUPSORT.
+ *
+ * \param [in] txn       A transaction handle returned by \ref mdbx_txn_begin().
+ * \param [in] dbi       A database handle returned by \ref mdbx_dbi_open().
+ * \param [in] key       The key to store in the database.
+ * \param [in] attr      The attribute to store in the database.
+ * \param [in,out] data  The data to store.
+ * \param [in] flags     Special options for this operation. This parameter
+ *                       must be set to 0 or by bitwise OR'ing together one or
+ *                       more of the values described here:
+ *  - \ref MDBX_NOOVERWRITE
+ *      Enter the new key/data pair only if the key does not already appear
+ *      in the database. The function will return \ref MDBX_KEYEXIST if the key
+ *      already appears in the database. The data parameter will be set to
+ *      point to the existing item.
+ *
+ *  - \ref MDBX_CURRENT
+ *      Update a single existing entry, but do not add new ones. The function
+ *      will return \ref MDBX_NOTFOUND if the given key does not exist in the
+ *      database, or \ref MDBX_EMULTIVAL in case of duplicates for the given
+ *      key.
+ *
+ *  - \ref MDBX_APPEND
+ *      Append the given key/data pair to the end of the database. This option
+ *      allows fast bulk loading when keys are already known to be in the
+ *      correct order. Loading unsorted keys with this flag will cause
+ *      a \ref MDBX_EKEYMISMATCH error.
+ *
+ * \see \ref c_crud_hints "Quick reference for Insert/Update/Delete operations"
+ *
+ * \returns A non-zero error value on failure and 0 on success,
+ *          some possible errors are:
+ * \retval MDBX_KEYEXIST
+ * \retval MDBX_MAP_FULL  The database is full, see \ref mdbx_env_set_mapsize().
+ * \retval MDBX_TXN_FULL  The transaction has too many dirty pages.
+ * \retval MDBX_EACCES    An attempt was made to write
+ *                        in a read-only transaction.
+ * \retval MDBX_EINVAL    An invalid parameter was specified. */
+LIBMDBX_API int mdbx_put_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key,
+                              MDBX_val *data, mdbx_attr_t attr,
+                              MDBX_put_flags_t flags);
+
+/** Set an item's attribute in a database.
+ *
+ * This function stores the attribute of a key/data pair in the database.
+ *
+ * \note Internally based on the \ref MDBX_RESERVE feature,
+ *       therefore it doesn't support \ref MDBX_DUPSORT.
+ *
+ * \param [in] txn   A transaction handle returned by \ref mdbx_txn_begin().
+ * \param [in] dbi   A database handle returned by \ref mdbx_dbi_open().
+ * \param [in] key   The key to search for in the database.
+ * \param [in] data  The data to be stored, or NULL to keep the previous value.
+ * \param [in] attr  The attribute to be stored.
+ *
+ * \returns A non-zero error value on failure and 0 on success,
+ *          some possible errors are:
+ * \retval MDBX_NOTFOUND  The key-value pair was not in the database.
+ * \retval MDBX_EINVAL    An invalid parameter was specified. */
+LIBMDBX_API int mdbx_set_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key,
+                              MDBX_val *data, mdbx_attr_t attr);
+
+/** Get an item's attribute via a database cursor.
+ *
+ * This function retrieves key/data pairs from the database. The address and
+ * length of the key are returned in the object to which key refers (except
+ * for the case of the \ref MDBX_SET option, in which the key object is
+ * unchanged), and the address and length of the data are returned in the
+ * object to which data refers.
+ * \see mdbx_get()
+ *
+ * \param [in] cursor    A cursor handle returned by \ref mdbx_cursor_open().
+ * \param [in,out] key   The key for a retrieved item.
+ * \param [in,out] data  The data of a retrieved item.
+ * \param [out] pattr    The pointer to retrieve the attribute.
+ * \param [in] op        A cursor operation MDBX_cursor_op.
+ *
+ * \returns A non-zero error value on failure and 0 on success,
+ *          some possible errors are:
+ * \retval MDBX_NOTFOUND  No matching key found.
+ * \retval MDBX_EINVAL    An invalid parameter was specified. */
+LIBMDBX_API int mdbx_cursor_get_attr(MDBX_cursor *cursor, MDBX_val *key,
+                                     MDBX_val *data, mdbx_attr_t *pattr,
+                                     MDBX_cursor_op op);
+
+/** Get an item's attribute from a database.
+ *
+ * This function retrieves key/data pairs from the database. The address
+ * and length of the data associated with the specified key are returned
+ * in the structure to which data refers.
+ * If the database supports duplicate keys (see \ref MDBX_DUPSORT) then the
+ * first data item for the key will be returned. Retrieval of other
+ * items requires the use of \ref mdbx_cursor_get().
+ *
+ * \note The memory pointed to by the returned values is owned by the
+ * database. The caller need not dispose of the memory, and may not
+ * modify it in any way. For values returned in a read-only transaction
+ * any modification attempts will cause a `SIGSEGV`.
+ *
+ * \note Values returned from the database are valid only until a
+ * subsequent update operation, or the end of the transaction.
+ *
+ * \param [in] txn       A transaction handle returned by \ref mdbx_txn_begin().
+ * \param [in] dbi       A database handle returned by \ref mdbx_dbi_open().
+ * \param [in] key       The key to search for in the database.
+ * \param [in,out] data  The data corresponding to the key.
+ * \param [out] pattr    The pointer to retrieve the attribute.
+ *
+ * \returns A non-zero error value on failure and 0 on success,
+ *          some possible errors are:
+ * \retval MDBX_NOTFOUND  The key was not in the database.
+ * \retval MDBX_EINVAL    An invalid parameter was specified. */
+LIBMDBX_API int mdbx_get_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key,
+                              MDBX_val *data, mdbx_attr_t *pattr);
+/** @} end of Attribute support functions for Nexenta */
+#endif /* MDBX_NEXENTA_ATTRS */
+
+/** @} end of C API */
+
+/*******************************************************************************
+ * Workaround for mmaped-lookahead-cross-page-boundary bug
+ * in obsolete versions of Elbrus's libc and kernels. */
+#if defined(__e2k__) && defined(MDBX_E2K_MLHCPB_WORKAROUND) && \
+    MDBX_E2K_MLHCPB_WORKAROUND
+LIBMDBX_API int mdbx_e2k_memcmp_bug_workaround(const void *s1, const void *s2,
+                                               size_t n);
+LIBMDBX_API int mdbx_e2k_strcmp_bug_workaround(const char *s1, const char *s2);
+LIBMDBX_API int mdbx_e2k_strncmp_bug_workaround(const char *s1, const char *s2,
+                                                size_t n);
+LIBMDBX_API size_t mdbx_e2k_strlen_bug_workaround(const char *s);
+LIBMDBX_API size_t mdbx_e2k_strnlen_bug_workaround(const char *s,
+                                                   size_t maxlen);
+#ifdef __cplusplus
+namespace std {
+inline int mdbx_e2k_memcmp_bug_workaround(const void *s1, const void *s2,
+                                          size_t n) {
+  return ::mdbx_e2k_memcmp_bug_workaround(s1, s2, n);
+}
+inline int mdbx_e2k_strcmp_bug_workaround(const char *s1, const char *s2) {
+  return ::mdbx_e2k_strcmp_bug_workaround(s1, s2);
+}
+inline int mdbx_e2k_strncmp_bug_workaround(const char *s1, const char *s2,
+                                           size_t n) {
+  return ::mdbx_e2k_strncmp_bug_workaround(s1, s2, n);
+}
+inline size_t mdbx_e2k_strlen_bug_workaround(const char *s) {
+  return ::mdbx_e2k_strlen_bug_workaround(s);
+}
+inline size_t mdbx_e2k_strnlen_bug_workaround(const char *s, size_t maxlen) {
+  return ::mdbx_e2k_strnlen_bug_workaround(s, maxlen);
+}
+} // namespace std
+#endif /* __cplusplus */
+
+#include <string.h>
+#include <strings.h>
+#undef memcmp
+#define memcmp mdbx_e2k_memcmp_bug_workaround
+#undef bcmp
+#define bcmp mdbx_e2k_memcmp_bug_workaround
+#undef strcmp
+#define strcmp mdbx_e2k_strcmp_bug_workaround
+#undef strncmp
+#define strncmp mdbx_e2k_strncmp_bug_workaround
+#undef strlen
+#define strlen mdbx_e2k_strlen_bug_workaround
+#undef strnlen
+#define strnlen mdbx_e2k_strnlen_bug_workaround
+#endif /* MDBX_E2K_MLHCPB_WORKAROUND */
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* LIBMDBX_H */
diff --git a/mdbx/mdbx_test.go b/mdbx/mdbx_test.go
index 5579094..a958cd6 100644
--- a/mdbx/mdbx_test.go
+++ b/mdbx/mdbx_test.go
@@ -2,8 +2,6 @@ package mdbx
 
 import (
 	"fmt"
-	"io/ioutil"
-	"os"
 	"testing"
 )
 
@@ -16,15 +14,7 @@ func TestTest1(t *testing.T) {
 	if err1 != nil {
 		t.Fatalf("Cannot set mapsize: %s", err1)
 	}
-	path, err1 := ioutil.TempDir("", "mdb_test")
-	if err1 != nil {
-		t.Fatalf("Cannot create temporary directory")
-	}
-	err1 = os.MkdirAll(path, 0770)
-	defer os.RemoveAll(path)
-	if err1 != nil {
-		t.Fatalf("Cannot create directory: %s", path)
-	}
+	path := t.TempDir()
 	err1 = env.Open(path, 0, 0664)
 	defer env.Close()
 	if err1 != nil {
diff --git a/internal/mdbxarch/width.go b/mdbx/mdbxarch/width.go
similarity index 97%
rename from internal/mdbxarch/width.go
rename to mdbx/mdbxarch/width.go
index faf17b1..025edf9 100644
--- a/internal/mdbxarch/width.go
+++ b/mdbx/mdbxarch/width.go
@@ -1,4 +1,4 @@
-// Package mdbxarch contains some architecture detection constants.  The
+// Package mdbxarch contains some architecture detection constants. The
 // primary reason the package exists is because the constant definitions are
 // scary and some will not pass linters.
package mdbxarch diff --git a/mdbx/mdbxgo.c b/mdbx/mdbxgo.c index 98182fb..826f5f2 100644 --- a/mdbx/mdbxgo.c +++ b/mdbx/mdbxgo.c @@ -5,7 +5,7 @@ #include #include "_cgo_export.h" #include "mdbxgo.h" -#include "dist/mdbx.h" +#include "mdbx.h" #define MDBXGO_SET_VAL(val, size, data) \ *(val) = (MDBX_val){.iov_len = (size), .iov_base = (data)} diff --git a/mdbx/mdbxgo.h b/mdbx/mdbxgo.h index abb442d..15079b6 100644 --- a/mdbx/mdbxgo.h +++ b/mdbx/mdbxgo.h @@ -5,7 +5,7 @@ #ifndef _MDBXGO_H_ #define _MDBXGO_H_ -#include "dist/mdbx.h" +#include "mdbx.h" /* Proxy functions for lmdb get/put operations. The functions are defined to * take char* values instead of void* to keep cgo from cheking their data for diff --git a/mdbx/txn.go b/mdbx/txn.go index d27d849..b78bead 100644 --- a/mdbx/txn.go +++ b/mdbx/txn.go @@ -48,6 +48,13 @@ const ( // // See MDBX_txn. type Txn struct { + env *Env + _txn *C.MDBX_txn + key *C.MDBX_val + val *C.MDBX_val + + errLogf func(format string, v ...interface{}) + // If RawRead is true []byte values retrieved from Get() calls on the Txn // and its cursors will point directly into the memory-mapped structure. // Such slices will be readonly and must only be referenced wthin the @@ -67,13 +74,6 @@ type Txn struct { // be paid. The id of a Txn cannot change over its life, even if it is // reset/renewed id uintptr - - env *Env - _txn *C.MDBX_txn - key *C.MDBX_val - val *C.MDBX_val - - errLogf func(format string, v ...interface{}) } // beginTxn does not lock the OS thread which is a prerequisite for creating a @@ -684,24 +684,3 @@ func (txn *Txn) Sequence(dbi DBI, increment uint64) (uint64, error) { } return uint64(res), nil } - -// DBIs - return names of all existing DBIs. Doesn't include Main and GC. -func (txn *Txn) DBIs() ([]string, error) { - var res []string - root, err := txn.OpenRoot(0) - if err != nil { - return nil, err - } - c, err := txn.OpenCursor(root) - if err != nil { - return nil, err - } - defer c.Close() - for k, _, err := c.Get(nil, nil, First); !IsNotFound(err); k, _, err = c.Get(nil, nil, Next) { - if err != nil { - return nil, err - } - res = append(res, string(k)) - } - return res, nil -} diff --git a/mdbx/txn_test.go b/mdbx/txn_test.go index 4759331..d4405fe 100644 --- a/mdbx/txn_test.go +++ b/mdbx/txn_test.go @@ -5,7 +5,6 @@ import ( "bytes" "encoding/binary" "fmt" - "os" "runtime" "syscall" "testing" @@ -13,7 +12,6 @@ import ( func TestTxn_ID(t *testing.T) { env := setup(t) - defer clean(env, t) var id0, id1, id2, id3 uintptr var txnInvalid *Txn @@ -100,7 +98,6 @@ func TestTxn_ID(t *testing.T) { func TestTxn_errLogf(t *testing.T) { env := setup(t) - defer clean(env, t) runtime.LockOSThread() defer runtime.UnlockOSThread() @@ -116,7 +113,6 @@ func TestTxn_errLogf(t *testing.T) { func TestTxn_Drop(t *testing.T) { env := setup(t) - defer clean(env, t) db, err := openDBI(env, "db", Create) if err != nil { @@ -168,7 +164,6 @@ func TestTxn_Drop(t *testing.T) { func TestTxn_Del(t *testing.T) { env := setup(t) - defer clean(env, t) var db DBI err := env.Update(func(txn *Txn) (err error) { @@ -204,7 +199,6 @@ func TestTxn_Del(t *testing.T) { func TestTxn_Del_dup(t *testing.T) { env := setup(t) - defer clean(env, t) var db DBI err := env.Update(func(txn *Txn) (err error) { @@ -246,7 +240,6 @@ func TestTxn_Del_dup(t *testing.T) { func TestTexn_Put_emptyValue(t *testing.T) { env := setup(t) - defer clean(env, t) var db DBI err := env.Update(func(txn *Txn) (err error) { @@ -275,7 +268,6 @@ func TestTexn_Put_emptyValue(t *testing.T) { func TestTxn_PutReserve(t 
@@ -275,7 +268,6 @@ func TestTexn_Put_emptyValue(t *testing.T) {
 
 func TestTxn_PutReserve(t *testing.T) {
 	env := setup(t)
-	defer clean(env, t)
 
 	var db DBI
 	err := env.Update(func(txn *Txn) (err error) {
@@ -318,7 +310,6 @@ func TestTxn_PutReserve(t *testing.T) {
 
 func TestTxn_bytesBuffer(t *testing.T) {
 	env := setup(t)
-	defer clean(env, t)
 
 	db, err := openRoot(env, 0)
 	if err != nil {
@@ -358,7 +349,6 @@ func TestTxn_bytesBuffer(t *testing.T) {
 
 func TestTxn_Put_overwrite(t *testing.T) {
 	env := setup(t)
-	defer clean(env, t)
 
 	db, err := openRoot(env, 0)
 	if err != nil {
@@ -404,7 +394,6 @@ func TestTxn_Put_overwrite(t *testing.T) {
 
 func TestTxn_OpenDBI_emptyName(t *testing.T) {
 	env := setup(t)
-	defer clean(env, t)
 
 	err := env.View(func(txn *Txn) (err error) {
 		_, err = txn.OpenDBISimple("", 0)
@@ -418,14 +407,19 @@ func TestTxn_OpenDBI_emptyName(t *testing.T) {
 		_, err = txn.OpenDBISimple("", Create)
 		return err
 	})
-	if !IsErrnoSys(err, syscall.EACCES) {
-		t.Errorf("mdb_dbi_open: %v", err)
+	if runtime.GOOS == "windows" {
+		if !IsErrnoSys(err, syscall.EIO) {
+			t.Errorf("mdb_dbi_open: %v", err)
+		}
+	} else {
+		if !IsErrnoSys(err, syscall.EACCES) {
+			t.Errorf("mdb_dbi_open: %v", err)
+		}
 	}
 }
 
 func TestTxn_OpenDBI_zero(t *testing.T) {
 	env := setup(t)
-	defer clean(env, t)
 
 	err := env.View(func(txn *Txn) (err error) {
 		_, err = txn.OpenRoot(0)
@@ -442,7 +436,6 @@ func TestTxn_OpenDBI_zero(t *testing.T) {
 
 func TestTxn_Commit_managed(t *testing.T) {
 	env := setup(t)
-	defer clean(env, t)
 
 	err := env.View(func(txn *Txn) (err error) {
 		defer func() {
@@ -497,8 +490,10 @@ func TestTxn_Commit_managed(t *testing.T) {
 }
 
 func TestTxn_Commit(t *testing.T) {
+	if runtime.GOOS == "windows" {
+		t.Skip("fix me")
+	}
 	env := setup(t)
-	defer clean(env, t)
 
 	runtime.LockOSThread()
 	defer runtime.UnlockOSThread()
@@ -511,13 +506,12 @@ func TestTxn_Commit(t *testing.T) {
 	txn.Abort()
 	_, err = txn.Commit()
 	if !IsErrnoSys(err, syscall.EINVAL) {
-		t.Errorf("mdb_txn_commit: %v", err)
+		t.Errorf("mdb_txn_commit: %s", err.Error())
 	}
 }
 
 func TestTxn_Update(t *testing.T) {
 	env := setup(t)
-	defer clean(env, t)
 
 	var db DBI
 	err := env.Update(func(txn *Txn) (err error) {
@@ -556,11 +550,8 @@ func TestTxn_Flags(t *testing.T) {
 	env := setup(t)
 	path, err := env.Path()
 	if err != nil {
-		_ = env.Close()
-		t.Error(err)
-		return
+		t.Fatal(err)
 	}
-	defer os.RemoveAll(path)
 
 	dbflags := uint(ReverseKey | ReverseDup | DupSort | DupFixed)
 	err = env.Update(func(txn *Txn) (err error) {
@@ -597,7 +588,7 @@
 		}
 		return nil
 	})
-	_ = env.Close()
+	env.Close()
 	if err != nil {
 		t.Error(err)
 		return
@@ -609,7 +600,7 @@
 		t.Error(err)
 		return
 	}
-	err = env.SetMaxDBs(1)
+	err = env.SetOption(OptMaxDB, uint64(1))
 	if err != nil {
 		t.Error(err)
 		return
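The TestTxn_Flags hunk just above swaps the dedicated SetMaxDBs helper for the generic option setter, env.SetOption(OptMaxDB, uint64(1)). A sketch of configuring an environment that way before opening it — openEnv is a hypothetical wrapper; SetOption and OptMaxDB are taken from the hunk, NewEnv and the Open arguments from the tests elsewhere in this patch:

```go
package mdbxutil

import "github.com/torquem-ch/mdbx-go/mdbx"

// openEnv configures the maximum number of named DBIs via the generic
// option setter (the replacement for the old SetMaxDBs) and then opens
// the environment at path.
func openEnv(path string) (*mdbx.Env, error) {
	env, err := mdbx.NewEnv()
	if err != nil {
		return nil, err
	}
	if err := env.SetOption(mdbx.OptMaxDB, uint64(8)); err != nil {
		env.Close()
		return nil, err
	}
	if err := env.Open(path, 0, 0664); err != nil {
		env.Close()
		return nil, err
	}
	return env, nil
}
```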
@@ -656,20 +647,12 @@
 
 func TestTxn_Renew(t *testing.T) {
 	env := setup(t)
-	path, err := env.Path()
-	if err != nil {
-		_ = env.Close()
-		t.Error(err)
-		return
-	}
-	defer os.RemoveAll(path)
-	defer env.Close()
 
 	// It is not necessary to call runtime.LockOSThread in this test because
 	// the only unmanaged Txn is Readonly.
 
 	var dbroot DBI
-	err = env.Update(func(txn *Txn) (err error) {
+	err := env.Update(func(txn *Txn) (err error) {
 		dbroot, err = txn.OpenRoot(0)
 		return err
 	})
@@ -721,14 +704,6 @@ func TestTxn_Renew(t *testing.T) {
 
 func TestTxn_Reset_doubleReset(t *testing.T) {
 	env := setup(t)
-	path, err := env.Path()
-	if err != nil {
-		env.Close()
-		t.Error(err)
-		return
-	}
-	defer os.RemoveAll(path)
-	defer env.Close()
 
 	txn, err := env.BeginTxn(nil, Readonly)
 	if err != nil {
@@ -745,15 +720,6 @@ func TestTxn_Reset_doubleReset(t *testing.T) {
 // transactions. The transaction may be committed after Reset/Renew are called.
 func TestTxn_Reset_writeTxn(t *testing.T) {
 	env := setup(t)
-	path, err := env.Path()
-	if err != nil {
-		env.Close()
-		t.Error(err)
-		return
-	}
-	defer os.RemoveAll(path)
-	defer env.Close()
-
 	runtime.LockOSThread()
 	defer runtime.UnlockOSThread()
@@ -776,8 +742,12 @@ func TestTxn_Reset_writeTxn(t *testing.T) {
 	// Reset is a noop and Renew will always error out.
 	txn.Reset()
 	err = txn.Renew()
-	if !IsErrnoSys(err, syscall.EINVAL) {
-		t.Errorf("renew: %v", err)
+	if runtime.GOOS == "windows" {
+		// todo
+	} else {
+		if !IsErrnoSys(err, syscall.EINVAL) {
+			t.Errorf("renew: %v", err)
+		}
 	}
 
 	_, err = txn.Commit()
@@ -802,20 +772,12 @@ func TestTxn_Reset_writeTxn(t *testing.T) {
 
 func TestTxn_UpdateLocked(t *testing.T) {
 	env := setup(t)
-	path, err := env.Path()
-	if err != nil {
-		env.Close()
-		t.Error(err)
-		return
-	}
-	defer os.RemoveAll(path)
-	defer env.Close()
 
 	runtime.LockOSThread()
 	defer runtime.UnlockOSThread()
 
 	var dbi DBI
-	err = env.UpdateLocked(func(txn *Txn) (err error) {
+	err := env.UpdateLocked(func(txn *Txn) (err error) {
 		dbi, err = txn.OpenRoot(0)
 		if err != nil {
 			return err
@@ -843,17 +805,9 @@ func TestTxn_UpdateLocked(t *testing.T) {
 
 func TestTxn_RunTxn(t *testing.T) {
 	env := setup(t)
-	path, err := env.Path()
-	if err != nil {
-		env.Close()
-		t.Error(err)
-		return
-	}
-	defer os.RemoveAll(path)
-	defer env.Close()
 
 	var dbi DBI
-	err = env.RunTxn(0, func(txn *Txn) (err error) {
+	err := env.RunTxn(0, func(txn *Txn) (err error) {
 		dbi, err = txn.OpenRoot(0)
 		if err != nil {
 			return err
@@ -885,17 +839,9 @@ func TestTxn_RunTxn(t *testing.T) {
 
 func TestTxn_Stat(t *testing.T) {
 	env := setup(t)
-	path, err := env.Path()
-	if err != nil {
-		env.Close()
-		t.Error(err)
-		return
-	}
-	defer os.RemoveAll(path)
-	defer env.Close()
 
 	var dbi DBI
-	err = env.Update(func(txn *Txn) (err error) {
+	err := env.Update(func(txn *Txn) (err error) {
 		dbi, err = txn.OpenDBISimple("testdb", Create)
 		return err
 	})
@@ -935,17 +881,9 @@ func TestTxn_Stat(t *testing.T) {
 
 func TestTxn_StatOnEmpty(t *testing.T) {
 	env := setup(t)
-	path, err := env.Path()
-	if err != nil {
-		env.Close()
-		t.Error(err)
-		return
-	}
-	defer os.RemoveAll(path)
-	defer env.Close()
 
 	var dbi DBI
-	err = env.Update(func(txn *Txn) (err error) {
+	err := env.Update(func(txn *Txn) (err error) {
		dbi, err = txn.OpenDBISimple("testdb", Create|DupSort)
 		return err
 	})
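The Renew and Reset tests above lean on the read-only transaction reuse cycle: Reset releases the snapshot while keeping the handle, Renew takes a fresh snapshot, and (as the comments in the hunks note) no runtime.LockOSThread is needed because the only unmanaged Txn is Readonly. A sketch of that cycle under the API exercised here — BeginTxn, Reset, Renew, Abort are from the hunks; the Get(dbi, key) ([]byte, error) signature is an assumption:

```go
package mdbxutil

import "github.com/torquem-ch/mdbx-go/mdbx"

// readTwice reuses one read-only transaction for two reads instead of
// opening a second transaction, via the Reset/Renew cycle.
func readTwice(env *mdbx.Env, dbi mdbx.DBI, key []byte) error {
	txn, err := env.BeginTxn(nil, mdbx.Readonly)
	if err != nil {
		return err
	}
	defer txn.Abort()

	if _, err := txn.Get(dbi, key); err != nil && !mdbx.IsNotFound(err) {
		return err
	}

	// Release the snapshot, then acquire a new one on the same handle.
	txn.Reset()
	if err := txn.Renew(); err != nil {
		return err
	}
	if _, err := txn.Get(dbi, key); err != nil && !mdbx.IsNotFound(err) {
		return err
	}
	return nil
}
```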
@@ -974,104 +912,12 @@ func TestTxn_StatOnEmpty(t *testing.T) {
 	}
 }
 
-func TestTxn_DBIs(t *testing.T) {
-	env := setup(t)
-	defer env.Close()
-
-	if err := env.Update(func(txn *Txn) (err error) {
-		_, err = txn.OpenDBISimple("test1", Create|DupSort)
-		if err != nil {
-			return err
-		}
-		_, err = txn.OpenDBISimple("test2", Create)
-		if err != nil {
-			return err
-		}
-		list, err := txn.DBIs()
-		if err != nil {
-			return err
-		}
-		if len(list) != 2 {
-			t.Fatalf("unexpected list of dbi's %+v", list)
-		}
-		if list[0] != "test1" {
-			t.Fatalf("unexpected list of dbi's %+v", list)
-		}
-		if list[1] != "test2" {
-			t.Fatalf("unexpected list of dbi's %+v", list)
-		}
-
-		return nil
-	}); err != nil {
-		t.Errorf("%s", err)
-		return
-	}
-
-	if err := env.View(func(txn *Txn) (err error) {
-		list, err := txn.DBIs()
-		if err != nil {
-			return err
-		}
-
-		if len(list) != 2 {
-			t.Fatalf("unexpected list of dbi's %+v", list)
-		}
-		if list[0] != "test1" {
-			t.Fatalf("unexpected list of dbi's %+v", list)
-		}
-		if list[1] != "test2" {
-			t.Fatalf("unexpected list of dbi's %+v", list)
-		}
-		return nil
-	}); err != nil {
-		t.Errorf("%s", err)
-		return
-	}
-
-	if err := env.Update(func(txn *Txn) (err error) {
-		dbi, err := txn.OpenDBI("test1", 0, nil, nil)
-		if err != nil {
-			return err
-		}
-
-		err = txn.Drop(dbi, true)
-		if err != nil {
-			return err
-		}
-
-		list, err := txn.DBIs()
-		if err != nil {
-			return err
-		}
-
-		if len(list) != 1 {
-			t.Fatalf("unexpected list of dbi's %+v", list)
-		}
-		if list[0] != "test2" {
-			t.Fatalf("unexpected list of dbi's %+v", list)
-		}
-		return nil
-	}); err != nil {
-		t.Errorf("%s", err)
-		return
-	}
-
-}
-
 func TestSequence(t *testing.T) {
 	env := setup(t)
-	path, err := env.Path()
-	if err != nil {
-		env.Close()
-		t.Error(err)
-		return
-	}
-	defer os.RemoveAll(path)
-	defer env.Close()
 
 	var dbi1 DBI
 	var dbi2 DBI
-	err = env.Update(func(txn *Txn) (err error) {
+	err := env.Update(func(txn *Txn) (err error) {
 		dbi1, err = txn.OpenDBISimple("testdb", Create)
 		if err != nil {
 			return err
 		}
@@ -1138,14 +984,6 @@ func TestSequence(t *testing.T) {
 
 func BenchmarkTxn_abort(b *testing.B) {
 	env := setup(b)
-	path, err := env.Path()
-	if err != nil {
-		env.Close()
-		b.Error(err)
-		return
-	}
-	defer os.RemoveAll(path)
-	defer env.Close()
 
 	var e = fmt.Errorf("abort")
 
@@ -1157,17 +995,9 @@
 func BenchmarkTxn_commit(b *testing.B) {
 	env := setup(b)
-	path, err := env.Path()
-	if err != nil {
-		env.Close()
-		b.Error(err)
-		return
-	}
-	defer os.RemoveAll(path)
-	defer env.Close()
 
 	var db DBI
-	err = env.Update(func(txn *Txn) (err error) {
+	err := env.Update(func(txn *Txn) (err error) {
 		db, err = txn.OpenDBISimple("testdb", Create)
 		if err != nil {
 			return err
 		}
@@ -1199,14 +1029,6 @@
 func BenchmarkTxn_ro(b *testing.B) {
 	env := setup(b)
-	path, err := env.Path()
-	if err != nil {
-		_ = env.Close()
-		b.Error(err)
-		return
-	}
-	defer os.RemoveAll(path)
-	defer env.Close()
 
 	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
@@ -1220,14 +1042,6 @@
 func BenchmarkTxn_unmanaged_abort(b *testing.B) {
 	env := setup(b)
-	path, err := env.Path()
-	if err != nil {
-		_ = env.Close()
-		b.Error(err)
-		return
-	}
-	defer os.RemoveAll(path)
-	defer env.Close()
 
 	runtime.LockOSThread()
 	defer runtime.UnlockOSThread()
@@ -1245,14 +1059,6 @@
 func BenchmarkTxn_unmanaged_commit(b *testing.B) {
 	env := setup(b)
-	path, err := env.Path()
-	if err != nil {
-		_ = env.Close()
-		b.Error(err)
-		return
-	}
-	defer os.RemoveAll(path)
-	defer env.Close()
 
 	runtime.LockOSThread()
 	defer runtime.UnlockOSThread()
@@ -1270,14 +1076,6 @@
 func BenchmarkTxn_unmanaged_ro(b *testing.B) {
 	env := setup(b)
-	path, err := env.Path()
-	if err != nil {
-		_ = env.Close()
-		b.Error(err)
-		return
-	}
-	defer os.RemoveAll(path)
-	defer env.Close()
 
 	// It is not necessary to call runtime.LockOSThread here because the txn is
 	// Readonly
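TestSequence, kept at the top of this span, exercises Txn.Sequence, whose signature appears in the txn.go hunk earlier: Sequence(dbi DBI, increment uint64) (uint64, error). A sketch of using it as a persistent per-DBI counter — nextID is a hypothetical wrapper, and the "returns the value before the increment" behavior is assumed from the usual mdbx_dbi_sequence semantics:

```go
package mdbxutil

import "github.com/torquem-ch/mdbx-go/mdbx"

// nextID atomically bumps a counter stored with the DBI and hands back the
// pre-increment value, making it usable as a unique-ID generator.
func nextID(env *mdbx.Env, dbi mdbx.DBI) (uint64, error) {
	var id uint64
	err := env.Update(func(txn *mdbx.Txn) error {
		v, err := txn.Sequence(dbi, 1)
		if err != nil {
			return err
		}
		id = v
		return nil
	})
	return id, err
}
```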
@@ -1295,14 +1093,6 @@
 func BenchmarkTxn_renew(b *testing.B) {
 	env := setup(b)
-	path, err := env.Path()
-	if err != nil {
-		env.Close()
-		b.Error(err)
-		return
-	}
-	defer os.RemoveAll(path)
-	defer env.Close()
 
 	// It is not necessary to call runtime.LockOSThread here because the txn is
 	// Readonly
@@ -1327,15 +1117,7 @@ func BenchmarkTxn_renew(b *testing.B) {
 
 func BenchmarkTxn_Put_append(b *testing.B) {
 	env := setup(b)
-	path, err := env.Path()
-	if err != nil {
-		_ = env.Close()
-		b.Error(err)
-		return
-	}
-	defer os.RemoveAll(path)
-	defer env.Close()
-	err = env.SetGeometry(-1, -1, 1024*1024, -1, -1, 4096)
+	err := env.SetGeometry(-1, -1, 1024*1024, -1, -1, 4096)
 	if err != nil {
 		b.Error(err)
 		return
 	}
@@ -1375,15 +1157,7 @@ func BenchmarkTxn_Put_append(b *testing.B) {
 
 func BenchmarkTxn_Put_append_noflag(b *testing.B) {
 	env := setup(b)
-	path, err := env.Path()
-	if err != nil {
-		_ = env.Close()
-		b.Error(err)
-		return
-	}
-	defer os.RemoveAll(path)
-	defer env.Close()
-	err = env.SetGeometry(-1, -1, 1024*1024, -1, -1, 4096)
+	err := env.SetGeometry(-1, -1, 1024*1024, -1, -1, 4096)
 	if err != nil {
 		b.Fatalf("Cannot set mapsize: %s", err)
 	}
diff --git a/mdbx/val.go b/mdbx/val.go
index 8cc8540..54b7c7c 100644
--- a/mdbx/val.go
+++ b/mdbx/val.go
@@ -4,14 +4,13 @@ package mdbx
 #include <stdlib.h>
 #include <stdio.h>
 #include "mdbxgo.h"
-#include "dist/mdbx.h"
 */
 import "C"
 
 import (
 	"unsafe"
 
-	"github.com/torquem-ch/mdbx-go/mdbx/internal/arch"
+	"github.com/torquem-ch/mdbx-go/mdbx/mdbxarch"
 )
 
 // Just for docs:
@@ -32,7 +31,7 @@ import (
 // On 64-bit systems, luckily, the value 2^32-1 coincides with the maximum data
 // size for LMDB (MAXDATASIZE).
 const (
-	valSizeBits = arch.Width64*32 + (1-arch.Width64)*31
+	valSizeBits = mdbxarch.Width64*32 + (1-mdbxarch.Width64)*31
 	valMaxSize  = 1<<valSizeBits - 1
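The valSizeBits arithmetic in the val.go hunk is easier to read as a worked case: Width64 is 1 on 64-bit targets and 0 on 32-bit ones, so the formula selects 32 or 31 bits, and valMaxSize lands on 2^32-1 on 64-bit systems, matching the MAXDATASIZE note in the comment. A standalone illustration (width64 is a stand-in for mdbxarch.Width64 on a 64-bit platform):

```go
package mdbxutil

// On 64-bit: valSizeBits = 1*32 + (1-1)*31 = 32, valMaxSize = 1<<32 - 1 = 2^32-1.
// On 32-bit: valSizeBits = 0*32 + (1-0)*31 = 31, valMaxSize = 1<<31 - 1.
const (
	width64     = 1 // stand-in for mdbxarch.Width64 on a 64-bit platform
	valSizeBits = width64*32 + (1-width64)*31
	valMaxSize  = 1<<valSizeBits - 1
)
```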