From f18b7f20c2dca29f7bdad45214c00672e75d7fef Mon Sep 17 00:00:00 2001 From: Valentin Date: Sun, 10 Nov 2024 12:24:33 +0100 Subject: [PATCH] --- .cargo/config.toml | 9 + .github/workflows/check.yml | 50 ++ .gitignore | 1 + Cargo.lock | 473 ++++++++++++++++++ Cargo.toml | 29 ++ benchmark/Cargo.toml | 14 + benchmark/benches/benchmark.rs | 58 +++ changelog.md | 5 + generated assembly/x86_64_default/f32_to_i128 | 20 + generated assembly/x86_64_default/f32_to_i16 | 7 + generated assembly/x86_64_default/f32_to_i32 | 9 + generated assembly/x86_64_default/f32_to_i64 | 9 + generated assembly/x86_64_default/f32_to_i8 | 7 + generated assembly/x86_64_default/f32_to_u128 | 16 + generated assembly/x86_64_default/f32_to_u16 | 7 + generated assembly/x86_64_default/f32_to_u32 | 10 + generated assembly/x86_64_default/f32_to_u64 | 17 + generated assembly/x86_64_default/f32_to_u8 | 7 + generated assembly/x86_64_default/f64_to_i128 | 20 + generated assembly/x86_64_default/f64_to_i16 | 7 + generated assembly/x86_64_default/f64_to_i32 | 8 + generated assembly/x86_64_default/f64_to_i64 | 9 + generated assembly/x86_64_default/f64_to_i8 | 7 + generated assembly/x86_64_default/f64_to_u128 | 16 + generated assembly/x86_64_default/f64_to_u16 | 7 + generated assembly/x86_64_default/f64_to_u32 | 7 + generated assembly/x86_64_default/f64_to_u64 | 17 + generated assembly/x86_64_default/f64_to_u8 | 7 + generated assembly/x86_64_sse/f32_to_i128 | 20 + generated assembly/x86_64_sse/f32_to_i16 | 3 + generated assembly/x86_64_sse/f32_to_i32 | 3 + generated assembly/x86_64_sse/f32_to_i64 | 3 + generated assembly/x86_64_sse/f32_to_i8 | 3 + generated assembly/x86_64_sse/f32_to_u128 | 16 + generated assembly/x86_64_sse/f32_to_u16 | 3 + generated assembly/x86_64_sse/f32_to_u32 | 3 + generated assembly/x86_64_sse/f32_to_u64 | 9 + generated assembly/x86_64_sse/f32_to_u8 | 3 + generated assembly/x86_64_sse/f64_to_i128 | 20 + generated assembly/x86_64_sse/f64_to_i16 | 3 + generated 
assembly/x86_64_sse/f64_to_i32 | 3 + generated assembly/x86_64_sse/f64_to_i64 | 3 + generated assembly/x86_64_sse/f64_to_i8 | 3 + generated assembly/x86_64_sse/f64_to_u128 | 16 + generated assembly/x86_64_sse/f64_to_u16 | 3 + generated assembly/x86_64_sse/f64_to_u32 | 3 + generated assembly/x86_64_sse/f64_to_u64 | 9 + generated assembly/x86_64_sse/f64_to_u8 | 3 + generated assembly/x86_sse/f32_to_i128 | 52 ++ generated assembly/x86_sse/f32_to_i16 | 3 + generated assembly/x86_sse/f32_to_i32 | 3 + generated assembly/x86_sse/f32_to_i64 | 37 ++ generated assembly/x86_sse/f32_to_i8 | 3 + generated assembly/x86_sse/f32_to_u128 | 47 ++ generated assembly/x86_sse/f32_to_u16 | 3 + generated assembly/x86_sse/f32_to_u32 | 15 + generated assembly/x86_sse/f32_to_u64 | 43 ++ generated assembly/x86_sse/f32_to_u8 | 3 + generated assembly/x86_sse/f64_to_i128 | 52 ++ generated assembly/x86_sse/f64_to_i16 | 3 + generated assembly/x86_sse/f64_to_i32 | 3 + generated assembly/x86_sse/f64_to_i64 | 37 ++ generated assembly/x86_sse/f64_to_i8 | 3 + generated assembly/x86_sse/f64_to_u128 | 47 ++ generated assembly/x86_sse/f64_to_u16 | 3 + generated assembly/x86_sse/f64_to_u32 | 15 + generated assembly/x86_sse/f64_to_u64 | 43 ++ generated assembly/x86_sse/f64_to_u8 | 3 + license | 373 ++++++++++++++ readme.md | 39 ++ src/lib.rs | 166 ++++++ src/target_default.rs | 34 ++ src/target_x86_64_sse.rs | 188 +++++++ src/target_x86_sse.rs | 149 ++++++ tests/test.rs | 162 ++++++ xtask/Cargo.toml | 9 + xtask/src/main.rs | 374 ++++++++++++++ 77 files changed, 2897 insertions(+) create mode 100644 .cargo/config.toml create mode 100644 .github/workflows/check.yml create mode 100644 .gitignore create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 benchmark/Cargo.toml create mode 100644 benchmark/benches/benchmark.rs create mode 100644 changelog.md create mode 100644 generated assembly/x86_64_default/f32_to_i128 create mode 100644 generated assembly/x86_64_default/f32_to_i16 create 
mode 100644 generated assembly/x86_64_default/f32_to_i32 create mode 100644 generated assembly/x86_64_default/f32_to_i64 create mode 100644 generated assembly/x86_64_default/f32_to_i8 create mode 100644 generated assembly/x86_64_default/f32_to_u128 create mode 100644 generated assembly/x86_64_default/f32_to_u16 create mode 100644 generated assembly/x86_64_default/f32_to_u32 create mode 100644 generated assembly/x86_64_default/f32_to_u64 create mode 100644 generated assembly/x86_64_default/f32_to_u8 create mode 100644 generated assembly/x86_64_default/f64_to_i128 create mode 100644 generated assembly/x86_64_default/f64_to_i16 create mode 100644 generated assembly/x86_64_default/f64_to_i32 create mode 100644 generated assembly/x86_64_default/f64_to_i64 create mode 100644 generated assembly/x86_64_default/f64_to_i8 create mode 100644 generated assembly/x86_64_default/f64_to_u128 create mode 100644 generated assembly/x86_64_default/f64_to_u16 create mode 100644 generated assembly/x86_64_default/f64_to_u32 create mode 100644 generated assembly/x86_64_default/f64_to_u64 create mode 100644 generated assembly/x86_64_default/f64_to_u8 create mode 100644 generated assembly/x86_64_sse/f32_to_i128 create mode 100644 generated assembly/x86_64_sse/f32_to_i16 create mode 100644 generated assembly/x86_64_sse/f32_to_i32 create mode 100644 generated assembly/x86_64_sse/f32_to_i64 create mode 100644 generated assembly/x86_64_sse/f32_to_i8 create mode 100644 generated assembly/x86_64_sse/f32_to_u128 create mode 100644 generated assembly/x86_64_sse/f32_to_u16 create mode 100644 generated assembly/x86_64_sse/f32_to_u32 create mode 100644 generated assembly/x86_64_sse/f32_to_u64 create mode 100644 generated assembly/x86_64_sse/f32_to_u8 create mode 100644 generated assembly/x86_64_sse/f64_to_i128 create mode 100644 generated assembly/x86_64_sse/f64_to_i16 create mode 100644 generated assembly/x86_64_sse/f64_to_i32 create mode 100644 generated assembly/x86_64_sse/f64_to_i64 create mode 
100644 generated assembly/x86_64_sse/f64_to_i8 create mode 100644 generated assembly/x86_64_sse/f64_to_u128 create mode 100644 generated assembly/x86_64_sse/f64_to_u16 create mode 100644 generated assembly/x86_64_sse/f64_to_u32 create mode 100644 generated assembly/x86_64_sse/f64_to_u64 create mode 100644 generated assembly/x86_64_sse/f64_to_u8 create mode 100644 generated assembly/x86_sse/f32_to_i128 create mode 100644 generated assembly/x86_sse/f32_to_i16 create mode 100644 generated assembly/x86_sse/f32_to_i32 create mode 100644 generated assembly/x86_sse/f32_to_i64 create mode 100644 generated assembly/x86_sse/f32_to_i8 create mode 100644 generated assembly/x86_sse/f32_to_u128 create mode 100644 generated assembly/x86_sse/f32_to_u16 create mode 100644 generated assembly/x86_sse/f32_to_u32 create mode 100644 generated assembly/x86_sse/f32_to_u64 create mode 100644 generated assembly/x86_sse/f32_to_u8 create mode 100644 generated assembly/x86_sse/f64_to_i128 create mode 100644 generated assembly/x86_sse/f64_to_i16 create mode 100644 generated assembly/x86_sse/f64_to_i32 create mode 100644 generated assembly/x86_sse/f64_to_i64 create mode 100644 generated assembly/x86_sse/f64_to_i8 create mode 100644 generated assembly/x86_sse/f64_to_u128 create mode 100644 generated assembly/x86_sse/f64_to_u16 create mode 100644 generated assembly/x86_sse/f64_to_u32 create mode 100644 generated assembly/x86_sse/f64_to_u64 create mode 100644 generated assembly/x86_sse/f64_to_u8 create mode 100644 license create mode 100644 readme.md create mode 100644 src/lib.rs create mode 100644 src/target_default.rs create mode 100644 src/target_x86_64_sse.rs create mode 100644 src/target_x86_sse.rs create mode 100644 tests/test.rs create mode 100644 xtask/Cargo.toml create mode 100644 xtask/src/main.rs diff --git a/.cargo/config.toml b/.cargo/config.toml new file mode 100644 index 0000000..4e374d5 --- /dev/null +++ b/.cargo/config.toml @@ -0,0 +1,9 @@ +[alias] +xtask = "run --package xtask --" 
+ +[profile.show-asm] +inherits = "release" +# more determinism +codegen-units=1 +# Values other than "thin" and "fat" cause the assembly to change in an undesirable way. It makes functions that have the same assembly link to each other via ".set". +lto = "thin" diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml new file mode 100644 index 0000000..8b95940 --- /dev/null +++ b/.github/workflows/check.yml @@ -0,0 +1,50 @@ +on: + push: + branches: [ "master" ] + pull_request: + branches: [ "master" ] + +jobs: + # We intentionally hardcode the stable/current version for the following reasons: + # + # - It makes it less likely that CI starts failing in the future despite the project not changing. + # - It makes us independent of the default Rust version that the Github runner comes with. + check_stable: + runs-on: ubuntu-24.04 + steps: + - run: sudo apt-get -qq install gcc-multilib qemu-user + - run: | + rustup --quiet toolchain uninstall stable + rustup --quiet toolchain install 1.82 --profile=default + rustup --quiet default 1.82 + - run: cargo install --quiet cargo-show-asm + - uses: actions/checkout@v4 + - run: cargo fmt --check + - run: cargo fetch --quiet --locked + - run: cargo clippy --quiet --workspace --all-targets -- --D=warnings + - run: cargo test --quiet --workspace + - run: cargo build --quiet --package xtask + - run: target/debug/xtask all + - name: Detect changes in generated assembly + run: | + if git status --porcelain -- "generated assembly" | grep ^; then + git diff -- "generated assembly" + echo Generated assembly has changed but the changes were not committed. + exit 1 + fi + + # For the MSRV we only care about the code compiling. 
+ check_minimum_supported_rust_version: + runs-on: ubuntu-24.04 + steps: + - run: sudo apt-get -qq install gcc-multilib + - run: | + rustup --quiet toolchain uninstall stable + rustup --quiet toolchain install 1.82 --profile=default + rustup --quiet toolchain install 1.71 --profile=minimal + rustup --quiet default 1.71 + - uses: actions/checkout@v4 + - run: cargo fetch --quiet --locked + # xtask does not use MSRV because we don't publish it. xtask's internal cargo calls use the default rustup toolchain, which is the MSRV. Note that we cannot run xtask through cargo because that forces the toolchain version for building xtask to be the same as the toolchain version for xtask's internal cargo calls through the RUSTUP_TOOLCHAIN environment variable. + - run: cargo +1.82 build --quiet --package xtask + - run: target/debug/xtask check diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ea8c4bf --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/target diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..84dbc8a --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,473 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 3 + +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] + +[[package]] +name = "anes" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" + +[[package]] +name = "anstyle" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" + +[[package]] +name = "anyhow" +version = "1.0.93" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c95c10ba0b00a02636238b814946408b1322d5ac4760326e6fb8ec956d85775" + +[[package]] +name = "autocfg" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" + +[[package]] +name = "benchmark" +version = "0.1.0" +dependencies = [ + "criterion", + "fast-float-to-integer", +] + +[[package]] +name = "cast" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "ciborium" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" +dependencies = [ + "ciborium-io", + "ciborium-ll", + "serde", +] + +[[package]] +name = "ciborium-io" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" + +[[package]] +name = "ciborium-ll" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" +dependencies = [ + "ciborium-io", + "half", +] + +[[package]] +name = "clap" +version = "4.5.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b97f376d85a664d5837dbae44bf546e6477a679ff6610010f17276f686d867e8" +dependencies = [ + "clap_builder", +] + +[[package]] +name = "clap_builder" +version = "4.5.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19bc80abd44e4bed93ca373a0704ccbd1b710dc5749406201bb018272808dc54" +dependencies = [ + "anstyle", + "clap_lex", +] + +[[package]] +name = "clap_lex" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97" + +[[package]] +name = "criterion" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" +dependencies = [ + "anes", + "cast", + "ciborium", + "clap", + "criterion-plot", + "is-terminal", + "itertools", + "num-traits", + "once_cell", + "oorandom", + "regex", + "serde", + "serde_derive", + "serde_json", + "tinytemplate", + "walkdir", +] + +[[package]] +name = "criterion-plot" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" +dependencies = [ + "cast", + "itertools", +] + +[[package]] +name = "crunchy" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" + +[[package]] +name = "either" +version = "1.13.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" + +[[package]] +name = "fast-float-to-integer" +version = "0.1.0" +dependencies = [ + "cfg-if", + "float_next_after", +] + +[[package]] +name = "float_next_after" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bf7cc16383c4b8d58b9905a8509f02926ce3058053c056376248d958c9df1e8" + +[[package]] +name = "half" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888" +dependencies = [ + "cfg-if", + "crunchy", +] + +[[package]] +name = "hermit-abi" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbf6a919d6cf397374f7dfeeea91d974c7c0a7221d0d0f4f20d859d329e53fcc" + +[[package]] +name = "is-terminal" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "261f68e344040fbd0edea105bef17c66edf46f984ddb1115b775ce31be948f4b" +dependencies = [ + "hermit-abi", + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "itertools" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" + +[[package]] +name = "libc" +version = "0.2.162" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "18d287de67fe55fd7e1581fe933d965a5a9477b38e949cfa9f8574ef01506398" + +[[package]] +name = "memchr" +version = "2.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" + 
+[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + +[[package]] +name = "once_cell" +version = "1.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" + +[[package]] +name = "oorandom" +version = "11.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b410bbe7e14ab526a0e86877eb47c6996a2bd7746f027ba551028c925390e4e9" + +[[package]] +name = "proc-macro2" +version = "1.0.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f139b0662de085916d1fb67d2b4169d1addddda1919e696f3252b740b629986e" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "regex" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "368758f23274712b504848e9d5a6f010445cc8b87a7cdb4d7cbee666c1288da3" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" + +[[package]] +name = "ryu" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "serde" +version = "1.0.214" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f55c3193aca71c12ad7890f1785d2b73e1b9f63a0bbc353c08ef26fe03fc56b5" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.214" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de523f781f095e28fa605cdce0f8307e451cc0fd14e2eb4cd2e98a355b147766" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.132" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d726bfaff4b320266d395898905d0eba0345aae23b54aee3a737e260fd46db03" +dependencies = [ + "itoa", + "memchr", + "ryu", + "serde", +] + +[[package]] +name = "syn" +version = "2.0.87" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25aa4ce346d03a6dcd68dd8b4010bcb74e54e62c90c573f394c46eae99aba32d" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "tinytemplate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + +[[package]] +name = "unicode-ident" +version = "1.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" + +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + 
"same-file", + "winapi-util", +] + +[[package]] +name = "winapi-util" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" +dependencies = [ + "windows-sys 0.59.0", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "xtask" +version = "0.1.0" +dependencies = [ + "anyhow", + "regex", +] diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..3897723 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,29 @@ +[workspace] +members = [ + "benchmark", + "xtask", +] + +[package] +name = "fast-float-to-integer" +version = "0.1.0" +authors = [ "Valentin Kettner " ] +edition = "2021" +rust-version = "1.71" +description = "Convert floating point values to integer types faster than the standard `as` operator." +repository = "https://github.com/e00E/fast-float-to-integer" +license = "MPL-2.0" +keywords = [ "float", "floating", "integer", "conversion", "convert" ] +include = [ "/src" ] + +[dependencies] +cfg-if = "1.0" + +[dev-dependencies] +float_next_after = "1.0" + +[features] +# This feature is for internal use. It ensures cargo-show-asm can get the assembly. +show-asm = [ ] +# This feature is for internal use. It disables all target specific code. 
+force-default = [ ] diff --git a/benchmark/Cargo.toml b/benchmark/Cargo.toml new file mode 100644 index 0000000..6865ba8 --- /dev/null +++ b/benchmark/Cargo.toml @@ -0,0 +1,14 @@ +# This crate contains benchmarks. The benchmarks are not in the main library crate because then cargo forces us to compile criterion when compiling tests, which fails on some targets. + +[package] +name = "benchmark" +version = "0.1.0" +edition = "2021" + +[dev-dependencies] +criterion = { version = "0.5", default-features = false, features = [ "cargo_bench_support" ] } +fast-float-to-integer = { path = "..", features=["force-default"] } + +[[bench]] +name = "benchmark" +harness = false diff --git a/benchmark/benches/benchmark.rs b/benchmark/benches/benchmark.rs new file mode 100644 index 0000000..a747a58 --- /dev/null +++ b/benchmark/benches/benchmark.rs @@ -0,0 +1,58 @@ +// Unfortunately, these benchmarks are noisy. There are significant differences in the measured performance based on random code permutation or running the benchmarks at different times or on different machines. The same function benchmarked twice can appear to have very different performance. +// +// We've changed some of the criterion settings to help with this, but the problem persists. It would be nice to have a more real world benchmark. + +use criterion::{criterion_group, criterion_main, Criterion}; +use fast_float_to_integer as ffti; +use std::{hint::black_box, time::Duration}; + +// We create a dependency between the converted numbers so that the compiler or CPU cannot skip the computation. +macro_rules! 
create_benchmark { + ($c:ident, $name:literal, $function:path, $Float:ty) => { + let floats = [0 as $Float; 1_000]; + $c.bench_function($name, |b| { + b.iter(|| { + let mut result = 0; + for float in black_box(floats.as_slice()) { + let converted = $function(*float); + result ^= converted; + } + black_box(result); + }) + }); + }; +} + +pub fn benchmark(c: &mut Criterion) { + let mut group = c.benchmark_group("complex"); + group + .sample_size(10_000) + .measurement_time(Duration::from_secs_f32(1.0)) + .warm_up_time(Duration::from_secs_f32(0.1)) + .nresamples(1); + + create_benchmark! {group, "f32_to_i8_optimized", ffti::f32_to_i8, f32} + create_benchmark! {group, "f32_to_u8_optimized", ffti::f32_to_u8, f32} + create_benchmark! {group, "f32_to_i16_optimized", ffti::f32_to_i16, f32} + create_benchmark! {group, "f32_to_u16_optimized", ffti::f32_to_u16, f32} + create_benchmark! {group, "f32_to_i32_optimized", ffti::f32_to_i32, f32} + create_benchmark! {group, "f32_to_u32_optimized", ffti::f32_to_u32, f32} + create_benchmark! {group, "f32_to_i64_optimized", ffti::f32_to_i64, f32} + create_benchmark! {group, "f32_to_u64_optimized", ffti::f32_to_u64, f32} + create_benchmark! {group, "f32_to_i128_optimized", ffti::f32_to_i128, f32} + create_benchmark! {group, "f32_to_u128_optimized", ffti::f32_to_u128, f32} + + create_benchmark! {group, "f64_to_i8_optimized", ffti::f64_to_i8, f64} + create_benchmark! {group, "f64_to_u8_optimized", ffti::f64_to_u8, f64} + create_benchmark! {group, "f64_to_i16_optimized", ffti::f64_to_i16, f64} + create_benchmark! {group, "f64_to_u16_optimized", ffti::f64_to_u16, f64} + create_benchmark! {group, "f64_to_i32_optimized", ffti::f64_to_i32, f64} + create_benchmark! {group, "f64_to_u32_optimized", ffti::f64_to_u32, f64} + create_benchmark! {group, "f64_to_i64_optimized", ffti::f64_to_i64, f64} + create_benchmark! {group, "f64_to_u64_optimized", ffti::f64_to_u64, f64} + create_benchmark! 
{group, "f64_to_i128_optimized", ffti::f64_to_i128, f64} + create_benchmark! {group, "f64_to_u128_optimized", ffti::f64_to_u128, f64} +} + +criterion_group!(benches, benchmark); +criterion_main!(benches); diff --git a/changelog.md b/changelog.md new file mode 100644 index 0000000..97a82e0 --- /dev/null +++ b/changelog.md @@ -0,0 +1,5 @@ +## unreleased + +## 0.1.0 - 2024-11-10 + +- initial release diff --git a/generated assembly/x86_64_default/f32_to_i128 b/generated assembly/x86_64_default/f32_to_i128 new file mode 100644 index 0000000..32ca751 --- /dev/null +++ b/generated assembly/x86_64_default/f32_to_i128 @@ -0,0 +1,20 @@ +fast_float_to_integer::f32_to_i128: + push rax + movss dword ptr [rsp + 4], xmm0 + call qword ptr [rip + __fixsfti@GOTPCREL] + xor ecx, ecx + movss xmm0, dword ptr [rsp + 4] + ucomiss xmm0, dword ptr [rip + .L_0] + cmovb rax, rcx + movabs rsi, -9223372036854775808 + cmovb rdx, rsi + ucomiss xmm0, dword ptr [rip + .L_1] + movabs rsi, 9223372036854775807 + cmova rdx, rsi + mov rsi, -1 + cmova rax, rsi + ucomiss xmm0, xmm0 + cmovp rax, rcx + cmovp rdx, rcx + pop rcx + ret diff --git a/generated assembly/x86_64_default/f32_to_i16 b/generated assembly/x86_64_default/f32_to_i16 new file mode 100644 index 0000000..23b3b1d --- /dev/null +++ b/generated assembly/x86_64_default/f32_to_i16 @@ -0,0 +1,7 @@ +fast_float_to_integer::f32_to_i16: + movss xmm1, dword ptr [rip + .L_0] + maxss xmm1, xmm0 + movss xmm0, dword ptr [rip + .L_1] + minss xmm0, xmm1 + cvttss2si eax, xmm0 + ret diff --git a/generated assembly/x86_64_default/f32_to_i32 b/generated assembly/x86_64_default/f32_to_i32 new file mode 100644 index 0000000..9f69c31 --- /dev/null +++ b/generated assembly/x86_64_default/f32_to_i32 @@ -0,0 +1,9 @@ +fast_float_to_integer::f32_to_i32: + cvttss2si eax, xmm0 + ucomiss xmm0, dword ptr [rip + .L_0] + mov ecx, 2147483647 + cmovbe ecx, eax + xor eax, eax + ucomiss xmm0, xmm0 + cmovnp eax, ecx + ret diff --git a/generated assembly/x86_64_default/f32_to_i64 
b/generated assembly/x86_64_default/f32_to_i64 new file mode 100644 index 0000000..24a73f4 --- /dev/null +++ b/generated assembly/x86_64_default/f32_to_i64 @@ -0,0 +1,9 @@ +fast_float_to_integer::f32_to_i64: + cvttss2si rax, xmm0 + ucomiss xmm0, dword ptr [rip + .L_0] + movabs rcx, 9223372036854775807 + cmovbe rcx, rax + xor eax, eax + ucomiss xmm0, xmm0 + cmovnp rax, rcx + ret diff --git a/generated assembly/x86_64_default/f32_to_i8 b/generated assembly/x86_64_default/f32_to_i8 new file mode 100644 index 0000000..0c9e7a5 --- /dev/null +++ b/generated assembly/x86_64_default/f32_to_i8 @@ -0,0 +1,7 @@ +fast_float_to_integer::f32_to_i8: + movss xmm1, dword ptr [rip + .L_0] + maxss xmm1, xmm0 + movss xmm0, dword ptr [rip + .L_1] + minss xmm0, xmm1 + cvttss2si eax, xmm0 + ret diff --git a/generated assembly/x86_64_default/f32_to_u128 b/generated assembly/x86_64_default/f32_to_u128 new file mode 100644 index 0000000..776bcb6 --- /dev/null +++ b/generated assembly/x86_64_default/f32_to_u128 @@ -0,0 +1,16 @@ +fast_float_to_integer::f32_to_u128: + push rax + movss dword ptr [rsp + 4], xmm0 + call qword ptr [rip + __fixunssfti@GOTPCREL] + xor ecx, ecx + xorps xmm0, xmm0 + movss xmm1, dword ptr [rsp + 4] + ucomiss xmm1, xmm0 + cmovb rdx, rcx + cmovb rax, rcx + ucomiss xmm1, dword ptr [rip + .L_0] + mov rcx, -1 + cmova rax, rcx + cmova rdx, rcx + pop rcx + ret diff --git a/generated assembly/x86_64_default/f32_to_u16 b/generated assembly/x86_64_default/f32_to_u16 new file mode 100644 index 0000000..224035e --- /dev/null +++ b/generated assembly/x86_64_default/f32_to_u16 @@ -0,0 +1,7 @@ +fast_float_to_integer::f32_to_u16: + xorps xmm1, xmm1 + maxss xmm1, xmm0 + movss xmm0, dword ptr [rip + .L_0] + minss xmm0, xmm1 + cvttss2si eax, xmm0 + ret diff --git a/generated assembly/x86_64_default/f32_to_u32 b/generated assembly/x86_64_default/f32_to_u32 new file mode 100644 index 0000000..7a60977 --- /dev/null +++ b/generated assembly/x86_64_default/f32_to_u32 @@ -0,0 +1,10 @@ 
+fast_float_to_integer::f32_to_u32: + cvttss2si rax, xmm0 + xor ecx, ecx + xorps xmm1, xmm1 + ucomiss xmm0, xmm1 + cmovae ecx, eax + ucomiss xmm0, dword ptr [rip + .L_0] + mov eax, -1 + cmovbe eax, ecx + ret diff --git a/generated assembly/x86_64_default/f32_to_u64 b/generated assembly/x86_64_default/f32_to_u64 new file mode 100644 index 0000000..b290d52 --- /dev/null +++ b/generated assembly/x86_64_default/f32_to_u64 @@ -0,0 +1,17 @@ +fast_float_to_integer::f32_to_u64: + cvttss2si rax, xmm0 + mov rcx, rax + sar rcx, 63 + movaps xmm1, xmm0 + subss xmm1, dword ptr [rip + .L_0] + cvttss2si rdx, xmm1 + and rdx, rcx + or rdx, rax + xor ecx, ecx + xorps xmm1, xmm1 + ucomiss xmm0, xmm1 + cmovae rcx, rdx + ucomiss xmm0, dword ptr [rip + .L_1] + mov rax, -1 + cmovbe rax, rcx + ret diff --git a/generated assembly/x86_64_default/f32_to_u8 b/generated assembly/x86_64_default/f32_to_u8 new file mode 100644 index 0000000..65668ed --- /dev/null +++ b/generated assembly/x86_64_default/f32_to_u8 @@ -0,0 +1,7 @@ +fast_float_to_integer::f32_to_u8: + xorps xmm1, xmm1 + maxss xmm1, xmm0 + movss xmm0, dword ptr [rip + .L_0] + minss xmm0, xmm1 + cvttss2si eax, xmm0 + ret diff --git a/generated assembly/x86_64_default/f64_to_i128 b/generated assembly/x86_64_default/f64_to_i128 new file mode 100644 index 0000000..da89a58 --- /dev/null +++ b/generated assembly/x86_64_default/f64_to_i128 @@ -0,0 +1,20 @@ +fast_float_to_integer::f64_to_i128: + push rax + movsd qword ptr [rsp], xmm0 + call qword ptr [rip + __fixdfti@GOTPCREL] + xor ecx, ecx + movsd xmm0, qword ptr [rsp] + ucomisd xmm0, qword ptr [rip + .L_0] + cmovb rax, rcx + movabs rsi, -9223372036854775808 + cmovb rdx, rsi + ucomisd xmm0, qword ptr [rip + .L_1] + movabs rsi, 9223372036854775807 + cmova rdx, rsi + mov rsi, -1 + cmova rax, rsi + ucomisd xmm0, xmm0 + cmovp rax, rcx + cmovp rdx, rcx + pop rcx + ret diff --git a/generated assembly/x86_64_default/f64_to_i16 b/generated assembly/x86_64_default/f64_to_i16 new file mode 100644 
index 0000000..0dd6131 --- /dev/null +++ b/generated assembly/x86_64_default/f64_to_i16 @@ -0,0 +1,7 @@ +fast_float_to_integer::f64_to_i16: + movsd xmm1, qword ptr [rip + .L_0] + maxsd xmm1, xmm0 + movsd xmm0, qword ptr [rip + .L_1] + minsd xmm0, xmm1 + cvttsd2si eax, xmm0 + ret diff --git a/generated assembly/x86_64_default/f64_to_i32 b/generated assembly/x86_64_default/f64_to_i32 new file mode 100644 index 0000000..9091e35 --- /dev/null +++ b/generated assembly/x86_64_default/f64_to_i32 @@ -0,0 +1,8 @@ +fast_float_to_integer::f64_to_i32: + xor eax, eax + ucomisd xmm0, xmm0 + maxsd xmm0, qword ptr [rip + .L_0] + minsd xmm0, qword ptr [rip + .L_1] + cvttsd2si ecx, xmm0 + cmovnp eax, ecx + ret diff --git a/generated assembly/x86_64_default/f64_to_i64 b/generated assembly/x86_64_default/f64_to_i64 new file mode 100644 index 0000000..c877c69 --- /dev/null +++ b/generated assembly/x86_64_default/f64_to_i64 @@ -0,0 +1,9 @@ +fast_float_to_integer::f64_to_i64: + cvttsd2si rax, xmm0 + ucomisd xmm0, qword ptr [rip + .L_0] + movabs rcx, 9223372036854775807 + cmovbe rcx, rax + xor eax, eax + ucomisd xmm0, xmm0 + cmovnp rax, rcx + ret diff --git a/generated assembly/x86_64_default/f64_to_i8 b/generated assembly/x86_64_default/f64_to_i8 new file mode 100644 index 0000000..80cfe20 --- /dev/null +++ b/generated assembly/x86_64_default/f64_to_i8 @@ -0,0 +1,7 @@ +fast_float_to_integer::f64_to_i8: + movsd xmm1, qword ptr [rip + .L_0] + maxsd xmm1, xmm0 + movsd xmm0, qword ptr [rip + .L_1] + minsd xmm0, xmm1 + cvttsd2si eax, xmm0 + ret diff --git a/generated assembly/x86_64_default/f64_to_u128 b/generated assembly/x86_64_default/f64_to_u128 new file mode 100644 index 0000000..3d74896 --- /dev/null +++ b/generated assembly/x86_64_default/f64_to_u128 @@ -0,0 +1,16 @@ +fast_float_to_integer::f64_to_u128: + push rax + movsd qword ptr [rsp], xmm0 + call qword ptr [rip + __fixunsdfti@GOTPCREL] + xor ecx, ecx + xorpd xmm0, xmm0 + movsd xmm1, qword ptr [rsp] + ucomisd xmm1, xmm0 + cmovb rdx, 
rcx + cmovb rax, rcx + ucomisd xmm1, qword ptr [rip + .L_0] + mov rcx, -1 + cmova rax, rcx + cmova rdx, rcx + pop rcx + ret diff --git a/generated assembly/x86_64_default/f64_to_u16 b/generated assembly/x86_64_default/f64_to_u16 new file mode 100644 index 0000000..36aa825 --- /dev/null +++ b/generated assembly/x86_64_default/f64_to_u16 @@ -0,0 +1,7 @@ +fast_float_to_integer::f64_to_u16: + xorpd xmm1, xmm1 + maxsd xmm1, xmm0 + movsd xmm0, qword ptr [rip + .L_0] + minsd xmm0, xmm1 + cvttsd2si eax, xmm0 + ret diff --git a/generated assembly/x86_64_default/f64_to_u32 b/generated assembly/x86_64_default/f64_to_u32 new file mode 100644 index 0000000..abada1f --- /dev/null +++ b/generated assembly/x86_64_default/f64_to_u32 @@ -0,0 +1,7 @@ +fast_float_to_integer::f64_to_u32: + xorpd xmm1, xmm1 + maxsd xmm1, xmm0 + movsd xmm0, qword ptr [rip + .L_0] + minsd xmm0, xmm1 + cvttsd2si rax, xmm0 + ret diff --git a/generated assembly/x86_64_default/f64_to_u64 b/generated assembly/x86_64_default/f64_to_u64 new file mode 100644 index 0000000..ec633ac --- /dev/null +++ b/generated assembly/x86_64_default/f64_to_u64 @@ -0,0 +1,17 @@ +fast_float_to_integer::f64_to_u64: + cvttsd2si rax, xmm0 + mov rcx, rax + sar rcx, 63 + movapd xmm1, xmm0 + subsd xmm1, qword ptr [rip + .L_0] + cvttsd2si rdx, xmm1 + and rdx, rcx + or rdx, rax + xor ecx, ecx + xorpd xmm1, xmm1 + ucomisd xmm0, xmm1 + cmovae rcx, rdx + ucomisd xmm0, qword ptr [rip + .L_1] + mov rax, -1 + cmovbe rax, rcx + ret diff --git a/generated assembly/x86_64_default/f64_to_u8 b/generated assembly/x86_64_default/f64_to_u8 new file mode 100644 index 0000000..4143db8 --- /dev/null +++ b/generated assembly/x86_64_default/f64_to_u8 @@ -0,0 +1,7 @@ +fast_float_to_integer::f64_to_u8: + xorpd xmm1, xmm1 + maxsd xmm1, xmm0 + movsd xmm0, qword ptr [rip + .L_0] + minsd xmm0, xmm1 + cvttsd2si eax, xmm0 + ret diff --git a/generated assembly/x86_64_sse/f32_to_i128 b/generated assembly/x86_64_sse/f32_to_i128 new file mode 100644 index 
0000000..32ca751 --- /dev/null +++ b/generated assembly/x86_64_sse/f32_to_i128 @@ -0,0 +1,20 @@ +fast_float_to_integer::f32_to_i128: + push rax + movss dword ptr [rsp + 4], xmm0 + call qword ptr [rip + __fixsfti@GOTPCREL] + xor ecx, ecx + movss xmm0, dword ptr [rsp + 4] + ucomiss xmm0, dword ptr [rip + .L_0] + cmovb rax, rcx + movabs rsi, -9223372036854775808 + cmovb rdx, rsi + ucomiss xmm0, dword ptr [rip + .L_1] + movabs rsi, 9223372036854775807 + cmova rdx, rsi + mov rsi, -1 + cmova rax, rsi + ucomiss xmm0, xmm0 + cmovp rax, rcx + cmovp rdx, rcx + pop rcx + ret diff --git a/generated assembly/x86_64_sse/f32_to_i16 b/generated assembly/x86_64_sse/f32_to_i16 new file mode 100644 index 0000000..de7b2e5 --- /dev/null +++ b/generated assembly/x86_64_sse/f32_to_i16 @@ -0,0 +1,3 @@ +fast_float_to_integer::f32_to_i16: + cvttss2si rax, xmm0 + ret diff --git a/generated assembly/x86_64_sse/f32_to_i32 b/generated assembly/x86_64_sse/f32_to_i32 new file mode 100644 index 0000000..44e3ec2 --- /dev/null +++ b/generated assembly/x86_64_sse/f32_to_i32 @@ -0,0 +1,3 @@ +fast_float_to_integer::f32_to_i32: + cvttss2si rax, xmm0 + ret diff --git a/generated assembly/x86_64_sse/f32_to_i64 b/generated assembly/x86_64_sse/f32_to_i64 new file mode 100644 index 0000000..f410ee4 --- /dev/null +++ b/generated assembly/x86_64_sse/f32_to_i64 @@ -0,0 +1,3 @@ +fast_float_to_integer::f32_to_i64: + cvttss2si rax, xmm0 + ret diff --git a/generated assembly/x86_64_sse/f32_to_i8 b/generated assembly/x86_64_sse/f32_to_i8 new file mode 100644 index 0000000..62f9dde --- /dev/null +++ b/generated assembly/x86_64_sse/f32_to_i8 @@ -0,0 +1,3 @@ +fast_float_to_integer::f32_to_i8: + cvttss2si rax, xmm0 + ret diff --git a/generated assembly/x86_64_sse/f32_to_u128 b/generated assembly/x86_64_sse/f32_to_u128 new file mode 100644 index 0000000..776bcb6 --- /dev/null +++ b/generated assembly/x86_64_sse/f32_to_u128 @@ -0,0 +1,16 @@ +fast_float_to_integer::f32_to_u128: + push rax + movss dword ptr [rsp + 4], xmm0 
+ call qword ptr [rip + __fixunssfti@GOTPCREL] + xor ecx, ecx + xorps xmm0, xmm0 + movss xmm1, dword ptr [rsp + 4] + ucomiss xmm1, xmm0 + cmovb rdx, rcx + cmovb rax, rcx + ucomiss xmm1, dword ptr [rip + .L_0] + mov rcx, -1 + cmova rax, rcx + cmova rdx, rcx + pop rcx + ret diff --git a/generated assembly/x86_64_sse/f32_to_u16 b/generated assembly/x86_64_sse/f32_to_u16 new file mode 100644 index 0000000..2dadc0d --- /dev/null +++ b/generated assembly/x86_64_sse/f32_to_u16 @@ -0,0 +1,3 @@ +fast_float_to_integer::f32_to_u16: + cvttss2si rax, xmm0 + ret diff --git a/generated assembly/x86_64_sse/f32_to_u32 b/generated assembly/x86_64_sse/f32_to_u32 new file mode 100644 index 0000000..da7575c --- /dev/null +++ b/generated assembly/x86_64_sse/f32_to_u32 @@ -0,0 +1,3 @@ +fast_float_to_integer::f32_to_u32: + cvttss2si rax, xmm0 + ret diff --git a/generated assembly/x86_64_sse/f32_to_u64 b/generated assembly/x86_64_sse/f32_to_u64 new file mode 100644 index 0000000..8a8988d --- /dev/null +++ b/generated assembly/x86_64_sse/f32_to_u64 @@ -0,0 +1,9 @@ +fast_float_to_integer::f32_to_u64: + cvttss2si rcx, xmm0 + addss xmm0, dword ptr [rip + .L_0] + cvttss2si rdx, xmm0 + mov rax, rcx + sar rax, 63 + and rax, rdx + or rax, rcx + ret diff --git a/generated assembly/x86_64_sse/f32_to_u8 b/generated assembly/x86_64_sse/f32_to_u8 new file mode 100644 index 0000000..c6b17aa --- /dev/null +++ b/generated assembly/x86_64_sse/f32_to_u8 @@ -0,0 +1,3 @@ +fast_float_to_integer::f32_to_u8: + cvttss2si rax, xmm0 + ret diff --git a/generated assembly/x86_64_sse/f64_to_i128 b/generated assembly/x86_64_sse/f64_to_i128 new file mode 100644 index 0000000..da89a58 --- /dev/null +++ b/generated assembly/x86_64_sse/f64_to_i128 @@ -0,0 +1,20 @@ +fast_float_to_integer::f64_to_i128: + push rax + movsd qword ptr [rsp], xmm0 + call qword ptr [rip + __fixdfti@GOTPCREL] + xor ecx, ecx + movsd xmm0, qword ptr [rsp] + ucomisd xmm0, qword ptr [rip + .L_0] + cmovb rax, rcx + movabs rsi, -9223372036854775808 + 
cmovb rdx, rsi + ucomisd xmm0, qword ptr [rip + .L_1] + movabs rsi, 9223372036854775807 + cmova rdx, rsi + mov rsi, -1 + cmova rax, rsi + ucomisd xmm0, xmm0 + cmovp rax, rcx + cmovp rdx, rcx + pop rcx + ret diff --git a/generated assembly/x86_64_sse/f64_to_i16 b/generated assembly/x86_64_sse/f64_to_i16 new file mode 100644 index 0000000..c7d2077 --- /dev/null +++ b/generated assembly/x86_64_sse/f64_to_i16 @@ -0,0 +1,3 @@ +fast_float_to_integer::f64_to_i16: + cvttsd2si rax, xmm0 + ret diff --git a/generated assembly/x86_64_sse/f64_to_i32 b/generated assembly/x86_64_sse/f64_to_i32 new file mode 100644 index 0000000..48cc827 --- /dev/null +++ b/generated assembly/x86_64_sse/f64_to_i32 @@ -0,0 +1,3 @@ +fast_float_to_integer::f64_to_i32: + cvttsd2si rax, xmm0 + ret diff --git a/generated assembly/x86_64_sse/f64_to_i64 b/generated assembly/x86_64_sse/f64_to_i64 new file mode 100644 index 0000000..2e43ff5 --- /dev/null +++ b/generated assembly/x86_64_sse/f64_to_i64 @@ -0,0 +1,3 @@ +fast_float_to_integer::f64_to_i64: + cvttsd2si rax, xmm0 + ret diff --git a/generated assembly/x86_64_sse/f64_to_i8 b/generated assembly/x86_64_sse/f64_to_i8 new file mode 100644 index 0000000..2bfd2dd --- /dev/null +++ b/generated assembly/x86_64_sse/f64_to_i8 @@ -0,0 +1,3 @@ +fast_float_to_integer::f64_to_i8: + cvttsd2si rax, xmm0 + ret diff --git a/generated assembly/x86_64_sse/f64_to_u128 b/generated assembly/x86_64_sse/f64_to_u128 new file mode 100644 index 0000000..3d74896 --- /dev/null +++ b/generated assembly/x86_64_sse/f64_to_u128 @@ -0,0 +1,16 @@ +fast_float_to_integer::f64_to_u128: + push rax + movsd qword ptr [rsp], xmm0 + call qword ptr [rip + __fixunsdfti@GOTPCREL] + xor ecx, ecx + xorpd xmm0, xmm0 + movsd xmm1, qword ptr [rsp] + ucomisd xmm1, xmm0 + cmovb rdx, rcx + cmovb rax, rcx + ucomisd xmm1, qword ptr [rip + .L_0] + mov rcx, -1 + cmova rax, rcx + cmova rdx, rcx + pop rcx + ret diff --git a/generated assembly/x86_64_sse/f64_to_u16 b/generated assembly/x86_64_sse/f64_to_u16 
new file mode 100644 index 0000000..5b3e66f --- /dev/null +++ b/generated assembly/x86_64_sse/f64_to_u16 @@ -0,0 +1,3 @@ +fast_float_to_integer::f64_to_u16: + cvttsd2si rax, xmm0 + ret diff --git a/generated assembly/x86_64_sse/f64_to_u32 b/generated assembly/x86_64_sse/f64_to_u32 new file mode 100644 index 0000000..6dd06f2 --- /dev/null +++ b/generated assembly/x86_64_sse/f64_to_u32 @@ -0,0 +1,3 @@ +fast_float_to_integer::f64_to_u32: + cvttsd2si rax, xmm0 + ret diff --git a/generated assembly/x86_64_sse/f64_to_u64 b/generated assembly/x86_64_sse/f64_to_u64 new file mode 100644 index 0000000..a7a9efc --- /dev/null +++ b/generated assembly/x86_64_sse/f64_to_u64 @@ -0,0 +1,9 @@ +fast_float_to_integer::f64_to_u64: + cvttsd2si rcx, xmm0 + addsd xmm0, qword ptr [rip + .L_0] + cvttsd2si rdx, xmm0 + mov rax, rcx + sar rax, 63 + and rax, rdx + or rax, rcx + ret diff --git a/generated assembly/x86_64_sse/f64_to_u8 b/generated assembly/x86_64_sse/f64_to_u8 new file mode 100644 index 0000000..01a71e0 --- /dev/null +++ b/generated assembly/x86_64_sse/f64_to_u8 @@ -0,0 +1,3 @@ +fast_float_to_integer::f64_to_u8: + cvttsd2si rax, xmm0 + ret diff --git a/generated assembly/x86_sse/f32_to_i128 b/generated assembly/x86_sse/f32_to_i128 new file mode 100644 index 0000000..eacb1bb --- /dev/null +++ b/generated assembly/x86_sse/f32_to_i128 @@ -0,0 +1,52 @@ +fast_float_to_integer::f32_to_i128: + push ebp + push ebx + push edi + push esi + sub esp, 44 + movss xmm0, dword ptr [esp + 68] + mov esi, dword ptr [esp + 64] + call .L_0$pb +.L_0$pb: + pop ebx + lea eax, [esp + 16] +.L_1: + add ebx, offset _GLOBAL_OFFSET_TABLE_+(.L_1-.L_0$pb) + mov dword ptr [esp], eax + movss dword ptr [esp + 4], xmm0 + call __fixsfti@PLT + sub esp, 4 + movss xmm0, dword ptr [esp + 68] + xor ecx, ecx + mov eax, dword ptr [esp + 16] + mov edx, dword ptr [esp + 20] + mov edi, dword ptr [esp + 24] + mov ebp, -2147483648 + ucomiss xmm0, dword ptr [ebx + .L_2@GOTOFF] + cmovb eax, ecx + cmovb edx, ecx + cmovb edi, ecx 
+ cmovae ebp, dword ptr [esp + 28] + ucomiss xmm0, dword ptr [ebx + .L_3@GOTOFF] + mov ebx, 2147483647 + cmovbe ebx, ebp + mov ebp, -1 + cmova edi, ebp + cmova edx, ebp + cmova eax, ebp + ucomiss xmm0, xmm0 + cmovp ebx, ecx + cmovp eax, ecx + cmovp edx, ecx + cmovp edi, ecx + mov dword ptr [esi + 12], ebx + mov dword ptr [esi + 8], edi + mov dword ptr [esi + 4], edx + mov dword ptr [esi], eax + mov eax, esi + add esp, 44 + pop esi + pop edi + pop ebx + pop ebp + ret 4 diff --git a/generated assembly/x86_sse/f32_to_i16 b/generated assembly/x86_sse/f32_to_i16 new file mode 100644 index 0000000..b3794ac --- /dev/null +++ b/generated assembly/x86_sse/f32_to_i16 @@ -0,0 +1,3 @@ +fast_float_to_integer::f32_to_i16: + cvttss2si eax, dword ptr [esp + 4] + ret diff --git a/generated assembly/x86_sse/f32_to_i32 b/generated assembly/x86_sse/f32_to_i32 new file mode 100644 index 0000000..2f21036 --- /dev/null +++ b/generated assembly/x86_sse/f32_to_i32 @@ -0,0 +1,3 @@ +fast_float_to_integer::f32_to_i32: + cvttss2si eax, dword ptr [esp + 4] + ret diff --git a/generated assembly/x86_sse/f32_to_i64 b/generated assembly/x86_sse/f32_to_i64 new file mode 100644 index 0000000..4dc7dbf --- /dev/null +++ b/generated assembly/x86_sse/f32_to_i64 @@ -0,0 +1,37 @@ +fast_float_to_integer::f32_to_i64: + push edi + push esi + sub esp, 20 + movss xmm0, dword ptr [esp + 32] + call .L_0$pb +.L_0$pb: + pop eax + mov edi, -2147483648 + mov edx, 2147483647 +.L_1: + add eax, offset _GLOBAL_OFFSET_TABLE_+(.L_1-.L_0$pb) + movss dword ptr [esp + 8], xmm0 + fld dword ptr [esp + 8] + fnstcw word ptr [esp + 4] + movzx ecx, word ptr [esp + 4] + or ecx, 3072 + mov word ptr [esp + 6], cx + xor ecx, ecx + ucomiss xmm0, dword ptr [eax + .L_2@GOTOFF] + fldcw word ptr [esp + 6] + fistp qword ptr [esp + 8] + fldcw word ptr [esp + 4] + mov esi, dword ptr [esp + 8] + cmovae edi, dword ptr [esp + 12] + cmovb esi, ecx + ucomiss xmm0, dword ptr [eax + .L_3@GOTOFF] + mov eax, -1 + cmovbe edx, edi + cmovbe eax, esi + 
ucomiss xmm0, xmm0 + cmovp eax, ecx + cmovp edx, ecx + add esp, 20 + pop esi + pop edi + ret diff --git a/generated assembly/x86_sse/f32_to_i8 b/generated assembly/x86_sse/f32_to_i8 new file mode 100644 index 0000000..f684c72 --- /dev/null +++ b/generated assembly/x86_sse/f32_to_i8 @@ -0,0 +1,3 @@ +fast_float_to_integer::f32_to_i8: + cvttss2si eax, dword ptr [esp + 4] + ret diff --git a/generated assembly/x86_sse/f32_to_u128 b/generated assembly/x86_sse/f32_to_u128 new file mode 100644 index 0000000..0de8438 --- /dev/null +++ b/generated assembly/x86_sse/f32_to_u128 @@ -0,0 +1,47 @@ +fast_float_to_integer::f32_to_u128: + push ebx + push edi + push esi + sub esp, 32 + movss xmm0, dword ptr [esp + 52] + mov esi, dword ptr [esp + 48] + call .L_0$pb +.L_0$pb: + pop ebx + lea eax, [esp + 16] +.L_1: + add ebx, offset _GLOBAL_OFFSET_TABLE_+(.L_1-.L_0$pb) + mov dword ptr [esp], eax + movss dword ptr [esp + 4], xmm0 + call __fixunssfti@PLT + sub esp, 4 + movss xmm1, dword ptr [esp + 52] + xorps xmm0, xmm0 + xor eax, eax + mov ecx, 0 + mov edx, 0 + mov edi, 0 + ucomiss xmm1, xmm0 + movaps xmm0, xmm1 + jb .L_2 + mov eax, dword ptr [esp + 28] + mov ecx, dword ptr [esp + 24] + mov edx, dword ptr [esp + 20] + mov edi, dword ptr [esp + 16] +.L_2: + ucomiss xmm0, dword ptr [ebx + .L_3@GOTOFF] + mov ebx, -1 + cmova eax, ebx + cmova edi, ebx + cmova edx, ebx + cmova ecx, ebx + mov dword ptr [esi + 12], eax + mov dword ptr [esi + 8], ecx + mov dword ptr [esi + 4], edx + mov dword ptr [esi], edi + mov eax, esi + add esp, 32 + pop esi + pop edi + pop ebx + ret 4 diff --git a/generated assembly/x86_sse/f32_to_u16 b/generated assembly/x86_sse/f32_to_u16 new file mode 100644 index 0000000..d5e72f8 --- /dev/null +++ b/generated assembly/x86_sse/f32_to_u16 @@ -0,0 +1,3 @@ +fast_float_to_integer::f32_to_u16: + cvttss2si eax, dword ptr [esp + 4] + ret diff --git a/generated assembly/x86_sse/f32_to_u32 b/generated assembly/x86_sse/f32_to_u32 new file mode 100644 index 0000000..7c88ff0 --- 
/dev/null +++ b/generated assembly/x86_sse/f32_to_u32 @@ -0,0 +1,15 @@ +fast_float_to_integer::f32_to_u32: + movss xmm0, dword ptr [esp + 4] + call .L_0$pb +.L_0$pb: + pop eax +.L_1: + add eax, offset _GLOBAL_OFFSET_TABLE_+(.L_1-.L_0$pb) + cvttss2si ecx, xmm0 + addss xmm0, dword ptr [eax + .L_2@GOTOFF] + mov eax, ecx + sar eax, 31 + cvttss2si edx, xmm0 + and eax, edx + or eax, ecx + ret diff --git a/generated assembly/x86_sse/f32_to_u64 b/generated assembly/x86_sse/f32_to_u64 new file mode 100644 index 0000000..bc4d1e8 --- /dev/null +++ b/generated assembly/x86_sse/f32_to_u64 @@ -0,0 +1,43 @@ +fast_float_to_integer::f32_to_u64: + push ebx + sub esp, 16 + call .L_0$pb +.L_0$pb: + pop ecx + movss xmm0, dword ptr [esp + 24] + xorps xmm1, xmm1 +.L_1: + add ecx, offset _GLOBAL_OFFSET_TABLE_+(.L_1-.L_0$pb) + movss xmm2, dword ptr [ecx + .L_2@GOTOFF] + ucomiss xmm2, xmm0 + jbe .L_3 + xorps xmm2, xmm2 +.L_3: + movaps xmm3, xmm0 + setbe bl + xor edx, edx + subss xmm3, xmm2 + movss dword ptr [esp + 8], xmm3 + fld dword ptr [esp + 8] + fnstcw word ptr [esp + 4] + movzx eax, word ptr [esp + 4] + or eax, 3072 + ucomiss xmm0, xmm1 + mov word ptr [esp + 6], ax + mov eax, 0 + fldcw word ptr [esp + 6] + fistp qword ptr [esp + 8] + fldcw word ptr [esp + 4] + jb .L_4 + movzx edx, bl + mov eax, dword ptr [esp + 8] + shl edx, 31 + xor edx, dword ptr [esp + 12] +.L_4: + ucomiss xmm0, dword ptr [ecx + .L_5@GOTOFF] + mov ecx, -1 + cmova edx, ecx + cmova eax, ecx + add esp, 16 + pop ebx + ret diff --git a/generated assembly/x86_sse/f32_to_u8 b/generated assembly/x86_sse/f32_to_u8 new file mode 100644 index 0000000..f9d9c0c --- /dev/null +++ b/generated assembly/x86_sse/f32_to_u8 @@ -0,0 +1,3 @@ +fast_float_to_integer::f32_to_u8: + cvttss2si eax, dword ptr [esp + 4] + ret diff --git a/generated assembly/x86_sse/f64_to_i128 b/generated assembly/x86_sse/f64_to_i128 new file mode 100644 index 0000000..ec6b8a6 --- /dev/null +++ b/generated assembly/x86_sse/f64_to_i128 @@ -0,0 +1,52 @@ 
+fast_float_to_integer::f64_to_i128: + push ebp + push ebx + push edi + push esi + sub esp, 44 + movsd xmm0, qword ptr [esp + 68] + mov esi, dword ptr [esp + 64] + call .L_0$pb +.L_0$pb: + pop ebx + lea eax, [esp + 16] +.L_1: + add ebx, offset _GLOBAL_OFFSET_TABLE_+(.L_1-.L_0$pb) + mov dword ptr [esp], eax + movsd qword ptr [esp + 4], xmm0 + call __fixdfti@PLT + sub esp, 4 + movsd xmm0, qword ptr [esp + 68] + xor ecx, ecx + mov eax, dword ptr [esp + 16] + mov edx, dword ptr [esp + 20] + mov edi, dword ptr [esp + 24] + mov ebp, -2147483648 + ucomisd xmm0, qword ptr [ebx + .L_2@GOTOFF] + cmovb eax, ecx + cmovb edx, ecx + cmovb edi, ecx + cmovae ebp, dword ptr [esp + 28] + ucomisd xmm0, qword ptr [ebx + .L_3@GOTOFF] + mov ebx, 2147483647 + cmovbe ebx, ebp + mov ebp, -1 + cmova edi, ebp + cmova edx, ebp + cmova eax, ebp + ucomisd xmm0, xmm0 + cmovp ebx, ecx + cmovp eax, ecx + cmovp edx, ecx + cmovp edi, ecx + mov dword ptr [esi + 12], ebx + mov dword ptr [esi + 8], edi + mov dword ptr [esi + 4], edx + mov dword ptr [esi], eax + mov eax, esi + add esp, 44 + pop esi + pop edi + pop ebx + pop ebp + ret 4 diff --git a/generated assembly/x86_sse/f64_to_i16 b/generated assembly/x86_sse/f64_to_i16 new file mode 100644 index 0000000..1dce409 --- /dev/null +++ b/generated assembly/x86_sse/f64_to_i16 @@ -0,0 +1,3 @@ +fast_float_to_integer::f64_to_i16: + cvttsd2si eax, qword ptr [esp + 4] + ret diff --git a/generated assembly/x86_sse/f64_to_i32 b/generated assembly/x86_sse/f64_to_i32 new file mode 100644 index 0000000..73530e5 --- /dev/null +++ b/generated assembly/x86_sse/f64_to_i32 @@ -0,0 +1,3 @@ +fast_float_to_integer::f64_to_i32: + cvttsd2si eax, qword ptr [esp + 4] + ret diff --git a/generated assembly/x86_sse/f64_to_i64 b/generated assembly/x86_sse/f64_to_i64 new file mode 100644 index 0000000..7cac98b --- /dev/null +++ b/generated assembly/x86_sse/f64_to_i64 @@ -0,0 +1,37 @@ +fast_float_to_integer::f64_to_i64: + push edi + push esi + sub esp, 20 + movsd xmm0, qword ptr 
[esp + 32] + call .L_0$pb +.L_0$pb: + pop eax + mov edi, -2147483648 + mov edx, 2147483647 +.L_1: + add eax, offset _GLOBAL_OFFSET_TABLE_+(.L_1-.L_0$pb) + movsd qword ptr [esp + 8], xmm0 + fld qword ptr [esp + 8] + fnstcw word ptr [esp + 4] + movzx ecx, word ptr [esp + 4] + or ecx, 3072 + mov word ptr [esp + 6], cx + xor ecx, ecx + ucomisd xmm0, qword ptr [eax + .L_2@GOTOFF] + fldcw word ptr [esp + 6] + fistp qword ptr [esp + 8] + fldcw word ptr [esp + 4] + mov esi, dword ptr [esp + 8] + cmovae edi, dword ptr [esp + 12] + cmovb esi, ecx + ucomisd xmm0, qword ptr [eax + .L_3@GOTOFF] + mov eax, -1 + cmovbe edx, edi + cmovbe eax, esi + ucomisd xmm0, xmm0 + cmovp eax, ecx + cmovp edx, ecx + add esp, 20 + pop esi + pop edi + ret diff --git a/generated assembly/x86_sse/f64_to_i8 b/generated assembly/x86_sse/f64_to_i8 new file mode 100644 index 0000000..4fcb1bf --- /dev/null +++ b/generated assembly/x86_sse/f64_to_i8 @@ -0,0 +1,3 @@ +fast_float_to_integer::f64_to_i8: + cvttsd2si eax, qword ptr [esp + 4] + ret diff --git a/generated assembly/x86_sse/f64_to_u128 b/generated assembly/x86_sse/f64_to_u128 new file mode 100644 index 0000000..4d58d05 --- /dev/null +++ b/generated assembly/x86_sse/f64_to_u128 @@ -0,0 +1,47 @@ +fast_float_to_integer::f64_to_u128: + push ebx + push edi + push esi + sub esp, 32 + movsd xmm0, qword ptr [esp + 52] + mov esi, dword ptr [esp + 48] + call .L_0$pb +.L_0$pb: + pop ebx + lea eax, [esp + 16] +.L_1: + add ebx, offset _GLOBAL_OFFSET_TABLE_+(.L_1-.L_0$pb) + mov dword ptr [esp], eax + movsd qword ptr [esp + 4], xmm0 + call __fixunsdfti@PLT + sub esp, 4 + movsd xmm1, qword ptr [esp + 52] + xorpd xmm0, xmm0 + xor eax, eax + mov ecx, 0 + mov edx, 0 + mov edi, 0 + ucomisd xmm1, xmm0 + movapd xmm0, xmm1 + jb .L_2 + mov eax, dword ptr [esp + 28] + mov ecx, dword ptr [esp + 24] + mov edx, dword ptr [esp + 20] + mov edi, dword ptr [esp + 16] +.L_2: + ucomisd xmm0, qword ptr [ebx + .L_3@GOTOFF] + mov ebx, -1 + cmova eax, ebx + cmova edi, ebx + cmova edx, 
ebx + cmova ecx, ebx + mov dword ptr [esi + 12], eax + mov dword ptr [esi + 8], ecx + mov dword ptr [esi + 4], edx + mov dword ptr [esi], edi + mov eax, esi + add esp, 32 + pop esi + pop edi + pop ebx + ret 4 diff --git a/generated assembly/x86_sse/f64_to_u16 b/generated assembly/x86_sse/f64_to_u16 new file mode 100644 index 0000000..64c292e --- /dev/null +++ b/generated assembly/x86_sse/f64_to_u16 @@ -0,0 +1,3 @@ +fast_float_to_integer::f64_to_u16: + cvttsd2si eax, qword ptr [esp + 4] + ret diff --git a/generated assembly/x86_sse/f64_to_u32 b/generated assembly/x86_sse/f64_to_u32 new file mode 100644 index 0000000..b91bf69 --- /dev/null +++ b/generated assembly/x86_sse/f64_to_u32 @@ -0,0 +1,15 @@ +fast_float_to_integer::f64_to_u32: + movsd xmm0, qword ptr [esp + 4] + call .L_0$pb +.L_0$pb: + pop eax +.L_1: + add eax, offset _GLOBAL_OFFSET_TABLE_+(.L_1-.L_0$pb) + cvttsd2si ecx, xmm0 + addsd xmm0, qword ptr [eax + .L_2@GOTOFF] + mov eax, ecx + sar eax, 31 + cvttsd2si edx, xmm0 + and eax, edx + or eax, ecx + ret diff --git a/generated assembly/x86_sse/f64_to_u64 b/generated assembly/x86_sse/f64_to_u64 new file mode 100644 index 0000000..e957366 --- /dev/null +++ b/generated assembly/x86_sse/f64_to_u64 @@ -0,0 +1,43 @@ +fast_float_to_integer::f64_to_u64: + push ebx + sub esp, 16 + call .L_0$pb +.L_0$pb: + pop ecx + movsd xmm0, qword ptr [esp + 24] + xorpd xmm1, xmm1 +.L_1: + add ecx, offset _GLOBAL_OFFSET_TABLE_+(.L_1-.L_0$pb) + movsd xmm2, qword ptr [ecx + .L_2@GOTOFF] + ucomisd xmm2, xmm0 + jbe .L_3 + xorpd xmm2, xmm2 +.L_3: + movapd xmm3, xmm0 + setbe bl + xor edx, edx + subsd xmm3, xmm2 + movsd qword ptr [esp + 8], xmm3 + fld qword ptr [esp + 8] + fnstcw word ptr [esp + 4] + movzx eax, word ptr [esp + 4] + or eax, 3072 + ucomisd xmm0, xmm1 + mov word ptr [esp + 6], ax + mov eax, 0 + fldcw word ptr [esp + 6] + fistp qword ptr [esp + 8] + fldcw word ptr [esp + 4] + jb .L_4 + movzx edx, bl + mov eax, dword ptr [esp + 8] + shl edx, 31 + xor edx, dword ptr [esp + 12] 
+.L_4: + ucomisd xmm0, qword ptr [ecx + .L_5@GOTOFF] + mov ecx, -1 + cmova edx, ecx + cmova eax, ecx + add esp, 16 + pop ebx + ret diff --git a/generated assembly/x86_sse/f64_to_u8 b/generated assembly/x86_sse/f64_to_u8 new file mode 100644 index 0000000..7b1814e --- /dev/null +++ b/generated assembly/x86_sse/f64_to_u8 @@ -0,0 +1,3 @@ +fast_float_to_integer::f64_to_u8: + cvttsd2si eax, qword ptr [esp + 4] + ret diff --git a/license b/license new file mode 100644 index 0000000..d0a1fa1 --- /dev/null +++ b/license @@ -0,0 +1,373 @@ +Mozilla Public License Version 2.0 +================================== + +1. Definitions +-------------- + +1.1. "Contributor" + means each individual or legal entity that creates, contributes to + the creation of, or owns Covered Software. + +1.2. "Contributor Version" + means the combination of the Contributions of others (if any) used + by a Contributor and that particular Contributor's Contribution. + +1.3. "Contribution" + means Covered Software of a particular Contributor. + +1.4. "Covered Software" + means Source Code Form to which the initial Contributor has attached + the notice in Exhibit A, the Executable Form of such Source Code + Form, and Modifications of such Source Code Form, in each case + including portions thereof. + +1.5. "Incompatible With Secondary Licenses" + means + + (a) that the initial Contributor has attached the notice described + in Exhibit B to the Covered Software; or + + (b) that the Covered Software was made available under the terms of + version 1.1 or earlier of the License, but not also under the + terms of a Secondary License. + +1.6. "Executable Form" + means any form of the work other than Source Code Form. + +1.7. "Larger Work" + means a work that combines Covered Software with other material, in + a separate file or files, that is not Covered Software. + +1.8. "License" + means this document. + +1.9. 
"Licensable" + means having the right to grant, to the maximum extent possible, + whether at the time of the initial grant or subsequently, any and + all of the rights conveyed by this License. + +1.10. "Modifications" + means any of the following: + + (a) any file in Source Code Form that results from an addition to, + deletion from, or modification of the contents of Covered + Software; or + + (b) any new file in Source Code Form that contains any Covered + Software. + +1.11. "Patent Claims" of a Contributor + means any patent claim(s), including without limitation, method, + process, and apparatus claims, in any patent Licensable by such + Contributor that would be infringed, but for the grant of the + License, by the making, using, selling, offering for sale, having + made, import, or transfer of either its Contributions or its + Contributor Version. + +1.12. "Secondary License" + means either the GNU General Public License, Version 2.0, the GNU + Lesser General Public License, Version 2.1, the GNU Affero General + Public License, Version 3.0, or any later versions of those + licenses. + +1.13. "Source Code Form" + means the form of the work preferred for making modifications. + +1.14. "You" (or "Your") + means an individual or a legal entity exercising rights under this + License. For legal entities, "You" includes any entity that + controls, is controlled by, or is under common control with You. For + purposes of this definition, "control" means (a) the power, direct + or indirect, to cause the direction or management of such entity, + whether by contract or otherwise, or (b) ownership of more than + fifty percent (50%) of the outstanding shares or beneficial + ownership of such entity. + +2. License Grants and Conditions +-------------------------------- + +2.1. 
Grants + +Each Contributor hereby grants You a world-wide, royalty-free, +non-exclusive license: + +(a) under intellectual property rights (other than patent or trademark) + Licensable by such Contributor to use, reproduce, make available, + modify, display, perform, distribute, and otherwise exploit its + Contributions, either on an unmodified basis, with Modifications, or + as part of a Larger Work; and + +(b) under Patent Claims of such Contributor to make, use, sell, offer + for sale, have made, import, and otherwise transfer either its + Contributions or its Contributor Version. + +2.2. Effective Date + +The licenses granted in Section 2.1 with respect to any Contribution +become effective for each Contribution on the date the Contributor first +distributes such Contribution. + +2.3. Limitations on Grant Scope + +The licenses granted in this Section 2 are the only rights granted under +this License. No additional rights or licenses will be implied from the +distribution or licensing of Covered Software under this License. +Notwithstanding Section 2.1(b) above, no patent license is granted by a +Contributor: + +(a) for any code that a Contributor has removed from Covered Software; + or + +(b) for infringements caused by: (i) Your and any other third party's + modifications of Covered Software, or (ii) the combination of its + Contributions with other software (except as part of its Contributor + Version); or + +(c) under Patent Claims infringed by Covered Software in the absence of + its Contributions. + +This License does not grant any rights in the trademarks, service marks, +or logos of any Contributor (except as may be necessary to comply with +the notice requirements in Section 3.4). + +2.4. 
Subsequent Licenses + +No Contributor makes additional grants as a result of Your choice to +distribute the Covered Software under a subsequent version of this +License (see Section 10.2) or under the terms of a Secondary License (if +permitted under the terms of Section 3.3). + +2.5. Representation + +Each Contributor represents that the Contributor believes its +Contributions are its original creation(s) or it has sufficient rights +to grant the rights to its Contributions conveyed by this License. + +2.6. Fair Use + +This License is not intended to limit any rights You have under +applicable copyright doctrines of fair use, fair dealing, or other +equivalents. + +2.7. Conditions + +Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted +in Section 2.1. + +3. Responsibilities +------------------- + +3.1. Distribution of Source Form + +All distribution of Covered Software in Source Code Form, including any +Modifications that You create or to which You contribute, must be under +the terms of this License. You must inform recipients that the Source +Code Form of the Covered Software is governed by the terms of this +License, and how they can obtain a copy of this License. You may not +attempt to alter or restrict the recipients' rights in the Source Code +Form. + +3.2. 
Distribution of Executable Form + +If You distribute Covered Software in Executable Form then: + +(a) such Covered Software must also be made available in Source Code + Form, as described in Section 3.1, and You must inform recipients of + the Executable Form how they can obtain a copy of such Source Code + Form by reasonable means in a timely manner, at a charge no more + than the cost of distribution to the recipient; and + +(b) You may distribute such Executable Form under the terms of this + License, or sublicense it under different terms, provided that the + license for the Executable Form does not attempt to limit or alter + the recipients' rights in the Source Code Form under this License. + +3.3. Distribution of a Larger Work + +You may create and distribute a Larger Work under terms of Your choice, +provided that You also comply with the requirements of this License for +the Covered Software. If the Larger Work is a combination of Covered +Software with a work governed by one or more Secondary Licenses, and the +Covered Software is not Incompatible With Secondary Licenses, this +License permits You to additionally distribute such Covered Software +under the terms of such Secondary License(s), so that the recipient of +the Larger Work may, at their option, further distribute the Covered +Software under the terms of either this License or such Secondary +License(s). + +3.4. Notices + +You may not remove or alter the substance of any license notices +(including copyright notices, patent notices, disclaimers of warranty, +or limitations of liability) contained within the Source Code Form of +the Covered Software, except that You may alter any license notices to +the extent required to remedy known factual inaccuracies. + +3.5. Application of Additional Terms + +You may choose to offer, and to charge a fee for, warranty, support, +indemnity or liability obligations to one or more recipients of Covered +Software. 
However, You may do so only on Your own behalf, and not on +behalf of any Contributor. You must make it absolutely clear that any +such warranty, support, indemnity, or liability obligation is offered by +You alone, and You hereby agree to indemnify every Contributor for any +liability incurred by such Contributor as a result of warranty, support, +indemnity or liability terms You offer. You may include additional +disclaimers of warranty and limitations of liability specific to any +jurisdiction. + +4. Inability to Comply Due to Statute or Regulation +--------------------------------------------------- + +If it is impossible for You to comply with any of the terms of this +License with respect to some or all of the Covered Software due to +statute, judicial order, or regulation then You must: (a) comply with +the terms of this License to the maximum extent possible; and (b) +describe the limitations and the code they affect. Such description must +be placed in a text file included with all distributions of the Covered +Software under this License. Except to the extent prohibited by statute +or regulation, such description must be sufficiently detailed for a +recipient of ordinary skill to be able to understand it. + +5. Termination +-------------- + +5.1. The rights granted under this License will terminate automatically +if You fail to comply with any of its terms. However, if You become +compliant, then the rights granted under this License from a particular +Contributor are reinstated (a) provisionally, unless and until such +Contributor explicitly and finally terminates Your grants, and (b) on an +ongoing basis, if such Contributor fails to notify You of the +non-compliance by some reasonable means prior to 60 days after You have +come back into compliance. 
Moreover, Your grants from a particular +Contributor are reinstated on an ongoing basis if such Contributor +notifies You of the non-compliance by some reasonable means, this is the +first time You have received notice of non-compliance with this License +from such Contributor, and You become compliant prior to 30 days after +Your receipt of the notice. + +5.2. If You initiate litigation against any entity by asserting a patent +infringement claim (excluding declaratory judgment actions, +counter-claims, and cross-claims) alleging that a Contributor Version +directly or indirectly infringes any patent, then the rights granted to +You by any and all Contributors for the Covered Software under Section +2.1 of this License shall terminate. + +5.3. In the event of termination under Sections 5.1 or 5.2 above, all +end user license agreements (excluding distributors and resellers) which +have been validly granted by You or Your distributors under this License +prior to termination shall survive termination. + +************************************************************************ +* * +* 6. Disclaimer of Warranty * +* ------------------------- * +* * +* Covered Software is provided under this License on an "as is" * +* basis, without warranty of any kind, either expressed, implied, or * +* statutory, including, without limitation, warranties that the * +* Covered Software is free of defects, merchantable, fit for a * +* particular purpose or non-infringing. The entire risk as to the * +* quality and performance of the Covered Software is with You. * +* Should any Covered Software prove defective in any respect, You * +* (not any Contributor) assume the cost of any necessary servicing, * +* repair, or correction. This disclaimer of warranty constitutes an * +* essential part of this License. No use of any Covered Software is * +* authorized under this License except under this disclaimer. 
* +* * +************************************************************************ + +************************************************************************ +* * +* 7. Limitation of Liability * +* -------------------------- * +* * +* Under no circumstances and under no legal theory, whether tort * +* (including negligence), contract, or otherwise, shall any * +* Contributor, or anyone who distributes Covered Software as * +* permitted above, be liable to You for any direct, indirect, * +* special, incidental, or consequential damages of any character * +* including, without limitation, damages for lost profits, loss of * +* goodwill, work stoppage, computer failure or malfunction, or any * +* and all other commercial damages or losses, even if such party * +* shall have been informed of the possibility of such damages. This * +* limitation of liability shall not apply to liability for death or * +* personal injury resulting from such party's negligence to the * +* extent applicable law prohibits such limitation. Some * +* jurisdictions do not allow the exclusion or limitation of * +* incidental or consequential damages, so this exclusion and * +* limitation may not apply to You. * +* * +************************************************************************ + +8. Litigation +------------- + +Any litigation relating to this License may be brought only in the +courts of a jurisdiction where the defendant maintains its principal +place of business and such litigation shall be governed by laws of that +jurisdiction, without reference to its conflict-of-law provisions. +Nothing in this Section shall prevent a party's ability to bring +cross-claims or counter-claims. + +9. Miscellaneous +---------------- + +This License represents the complete agreement concerning the subject +matter hereof. If any provision of this License is held to be +unenforceable, such provision shall be reformed only to the extent +necessary to make it enforceable. 
Any law or regulation which provides +that the language of a contract shall be construed against the drafter +shall not be used to construe this License against a Contributor. + +10. Versions of the License +--------------------------- + +10.1. New Versions + +Mozilla Foundation is the license steward. Except as provided in Section +10.3, no one other than the license steward has the right to modify or +publish new versions of this License. Each version will be given a +distinguishing version number. + +10.2. Effect of New Versions + +You may distribute the Covered Software under the terms of the version +of the License under which You originally received the Covered Software, +or under the terms of any subsequent version published by the license +steward. + +10.3. Modified Versions + +If you create software not governed by this License, and you want to +create a new license for such software, you may create and use a +modified version of this License if you rename the license and remove +any references to the name of the license steward (except to note that +such modified license differs from this License). + +10.4. Distributing Source Code Form that is Incompatible With Secondary +Licenses + +If You choose to distribute Source Code Form that is Incompatible With +Secondary Licenses under the terms of this version of the License, the +notice described in Exhibit B of this License must be attached. + +Exhibit A - Source Code Form License Notice +------------------------------------------- + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at https://mozilla.org/MPL/2.0/. + +If it is not possible or desirable to put the notice in a particular +file, then You may include the notice in a location (such as a LICENSE +file in a relevant directory) where a recipient would be likely to look +for such a notice. 
+ +You may add additional accurate notices of copyright ownership. + +Exhibit B - "Incompatible With Secondary Licenses" Notice +--------------------------------------------------------- + + This Source Code Form is "Incompatible With Secondary Licenses", as + defined by the Mozilla Public License, v. 2.0. diff --git a/readme.md b/readme.md new file mode 100644 index 0000000..6239d0d --- /dev/null +++ b/readme.md @@ -0,0 +1,39 @@ +Convert floating point values to integer types faster than the standard `as` operator. + +See the [library documentation](https://docs.rs/fast-float-to-integer) for documentation targeting users of the library. + +--- + +# Development + +We use the [xtask](https://github.com/matklad/cargo-xtask) pattern to implement automation tasks in Rust rather than shell scripts. This provides an easy way to compile for different targets and run the tests through qemu. + +CI enforces that all targets compile, pass tests, and that the generated assembly committed to the repository is up to date. + +# Releasing + +- Update the changelog. +- Update the version in Cargo.toml. +- Create a git tag for the version. + +# Improvements + +## More targets + +We should add common targets like aarch64. + +## AVX512 + +AVX512 can convert float to u64 in [one instruction](https://www.felixcloutier.com/x86/vcvttps2udq), but the intrinsics are [not stable](https://github.com/rust-lang/rust/issues/111137). + +We should make sure that AVX512 is actually faster in practice than the current approach. + +## Cross compilation + +The current cross compilation setup is brittle. It assumes the host is x86 and that all the targets are x86 variants. This breaks for other architectures like aarch64 that need a custom linker. See the following links for more information: + +- https://rust-lang.github.io/rustup/cross-compilation.html +- https://github.com/japaric/rust-cross/blob/master/README.md#c-cross-toolchain +- https://github.com/cross-rs/cross + +We should improve this setup.
Either set up the linking tools manually or use cargo cross. diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..3fa3a0f --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,166 @@ +//! Convert floating point values to integer types faster than the standard `as` operator. +//! +//! The standard way of converting floating point values to integers is with the [`as` operator](https://doc.rust-lang.org/reference/expressions/operator-expr.html#type-cast-expressions). This conversion has various guarantees as listed in the reference. One of them is that it saturates: Input values out of range of the output type convert to the minimal/maximal value of the output type. +//! +//! ``` +//! assert_eq!(300f32 as u8, 255); +//! assert_eq!(-5f32 as u8, 0); +//! ``` +//! +//! This contrasts with C/C++, where this kind of cast is [undefined behavior](https://github.com/e00E/cpp-clamp-cast). Saturation comes with a downside. It is slower than the C/C++ version. On many [hardware targets](https://doc.rust-lang.org/nightly/rustc/platform-support.html) a float to integer conversion can be done in one instruction. For example [`CVTTSS2SI`](https://www.felixcloutier.com/x86/cvttss2si) on x86_64+SSE. Rust has to do more work than this, because the instruction does not provide saturation. +//! +//! Sometimes you want faster conversions and don't need saturation. This is what this crate provides. The behavior of the conversion functions in this crate depends on whether the input value is in range of the output type. If in range, then the conversion functions work like the standard `as` operator conversion. If not in range (including NaN), then you get an unspecified value. +//! +//! You never get undefined behavior but you can get unspecified behavior. In the unspecified case, you get an arbitrary value. The function returns and you get a valid value of the output type, but there is no guarantee what that value is. +//! +//!
This crate picks an implementation automatically at compile time based on the [target](https://doc.rust-lang.org/reference/conditional-compilation.html#target_arch) and [features](https://doc.rust-lang.org/reference/attributes/codegen.html#the-target_feature-attribute). If there is no specialized implementation, then this crate picks the standard `as` operator conversion. This crate has optimized implementations on the following targets: +//! +//! - `target_arch = "x86_64", target_feature = "sse"`: all conversions except 128 bit integers +//! - `target_arch = "x86", target_feature = "sse"`: all conversions except 64 bit and 128 bit integers +//! +//! # Assembly comparison +//! +//! The [repository](https://github.com/e00E/fast-float-to-integer) contains generated assembly for every conversion and target. Here are some typical examples on x86_64+SSE. +//! +// +// We could do something like `#![doc = include_str!("../generated assembly/x86_64_default/f32_to_i64")]` to include the assembly directly. The downside of that is that compiling the library requires the assembly file to be there and we have to publish the file. +// +//! standard: +//! +//! ```asm +//! f32_to_i64: +//! cvttss2si rax, xmm0 +//! ucomiss xmm0, dword ptr [rip + .L_0] +//! movabs rcx, 9223372036854775807 +//! cmovbe rcx, rax +//! xor eax, eax +//! ucomiss xmm0, xmm0 +//! cmovnp rax, rcx +//! ret +//! ``` +//! +//! fast: +//! +//! ```asm +//! f32_to_i64: +//! cvttss2si rax, xmm0 +//! ret +//! ``` +//! +//! standard: +//! +//! ```asm +//! f32_to_u64: +//! cvttss2si rax, xmm0 +//! mov rcx, rax +//! sar rcx, 63 +//! movaps xmm1, xmm0 +//! subss xmm1, dword ptr [rip + .L_0] +//! cvttss2si rdx, xmm1 +//! and rdx, rcx +//! or rdx, rax +//! xor ecx, ecx +//! xorps xmm1, xmm1 +//! ucomiss xmm0, xmm1 +//! cmovae rcx, rdx +//! ucomiss xmm0, dword ptr [rip + .L_1] +//! mov rax, -1 +//! cmovbe rax, rcx +//! ret +//! ``` +//! +//! fast: +//! +//! ```asm +//! f32_to_u64: +//! cvttss2si rcx, xmm0 +//! 
addss xmm0, dword ptr [rip + .L_0] +//! cvttss2si rdx, xmm0 +//! mov rax, rcx +//! sar rax, 63 +//! and rax, rdx +//! or rax, rcx +//! ret +//! ``` + +#![cfg_attr(not(test), no_std)] + +/// Raise two to some power. +/// +/// This function exists because libcore does not provide the [`f32::powi`] family of functions. +#[allow(dead_code)] +const fn power_of_two_f32(exponent: u32) -> f32 { + (2u128).pow(exponent) as f32 +} + +/// Like power_of_two_f32 but for f64. +#[allow(dead_code)] +const fn power_of_two_f64(exponent: u32) -> f64 { + (2u128).pow(exponent) as f64 +} + +macro_rules! create_target { + ($name:ident) => { + use $name as active_target; + + // Create a test with the target name so we can check that the expected target is active. The following command prints the active target through the test name: + // + // cargo test --quiet --package fast-float-to-integer --lib -- --list + #[test] + fn $name() {} + }; +} + +// Conditionally compiled target specific modules. The condition is set based on the availability of the intrinsics they use. This makes it safe to use the module. See the `target_default` module for the interface. +// +// We would put the mod declaration inside of the create_target macro too, but then rustfmt does not understand it. +cfg_if::cfg_if! { + if #[cfg(feature = "force-default")] { + mod target_default; + create_target!(target_default); + } else if #[cfg(all(target_arch = "x86_64", target_feature = "sse"))] { + mod target_x86_64_sse; + create_target!(target_x86_64_sse); + } else if #[cfg(all(target_arch = "x86", target_feature = "sse"))] { + mod target_x86_sse; + create_target!(target_x86_sse); + } else { + mod target_default; + create_target!(target_default); + } +} + +macro_rules! create_function { + ($name:ident, $Float:ty, $Integer:ty) => { + /// Convert the input floating point value to the output integer type. + /// + /// If the input value is out of range of the output type, then the result is unspecified.
Otherwise, the result is the same as the standard `as` conversion. + #[cfg_attr(feature = "show-asm", inline(never))] + #[cfg_attr(not(feature = "show-asm"), inline(always))] + pub fn $name(float: $Float) -> $Integer { + active_target::implementation::$name(float) + } + }; +} + +create_function! {f32_to_i8, f32, i8} +create_function! {f32_to_u8, f32, u8} +create_function! {f32_to_i16, f32, i16} +create_function! {f32_to_u16, f32, u16} +create_function! {f32_to_i32, f32, i32} +create_function! {f32_to_u32, f32, u32} +create_function! {f32_to_i64, f32, i64} +create_function! {f32_to_u64, f32, u64} +create_function! {f32_to_i128, f32, i128} +create_function! {f32_to_u128, f32, u128} + +create_function! {f64_to_i8, f64, i8} +create_function! {f64_to_u8, f64, u8} +create_function! {f64_to_i16, f64, i16} +create_function! {f64_to_u16, f64, u16} +create_function! {f64_to_i32, f64, i32} +create_function! {f64_to_u32, f64, u32} +create_function! {f64_to_i64, f64, i64} +create_function! {f64_to_u64, f64, u64} +create_function! {f64_to_i128, f64, i128} +create_function! {f64_to_u128, f64, u128} diff --git a/src/target_default.rs b/src/target_default.rs new file mode 100644 index 0000000..792fa71 --- /dev/null +++ b/src/target_default.rs @@ -0,0 +1,34 @@ +// There is an inner module to separate the implementation from the interface. + +macro_rules! create_function { + ($name:ident, $Input:ty, $Output: ty) => { + #[inline(always)] + pub fn $name(float: $Input) -> $Output { + float as _ + } + }; +} + +pub mod implementation { + create_function! {f32_to_i8, f32, i8} + create_function! {f32_to_u8, f32, u8} + create_function! {f32_to_i16, f32, i16} + create_function! {f32_to_u16, f32, u16} + create_function! {f32_to_i32, f32, i32} + create_function! {f32_to_u32, f32, u32} + create_function! {f32_to_i64, f32, i64} + create_function! {f32_to_u64, f32, u64} + create_function! {f32_to_i128, f32, i128} + create_function! {f32_to_u128, f32, u128} + + create_function! 
{f64_to_i8, f64, i8} + create_function! {f64_to_u8, f64, u8} + create_function! {f64_to_i16, f64, i16} + create_function! {f64_to_u16, f64, u16} + create_function! {f64_to_i32, f64, i32} + create_function! {f64_to_u32, f64, u32} + create_function! {f64_to_i64, f64, i64} + create_function! {f64_to_u64, f64, u64} + create_function! {f64_to_i128, f64, i128} + create_function! {f64_to_u128, f64, u128} +} diff --git a/src/target_x86_64_sse.rs b/src/target_x86_64_sse.rs new file mode 100644 index 0000000..d266c7d --- /dev/null +++ b/src/target_x86_64_sse.rs @@ -0,0 +1,188 @@ +use core::arch::x86_64::{_mm_cvttsd_si64, _mm_cvttss_si64, _mm_loadu_pd, _mm_loadu_ps}; + +use crate::{power_of_two_f32, power_of_two_f64}; + +/// Convert f32 to i64 using the CVTTSS2SI instruction. If the input f32 is out of range of the output i64, then the result is i64::MIN. +#[inline(always)] +fn f32_to_i64(float: f32) -> i64 { + // The compiler optimizes this function into a single instruction without the need for inline assembly. + + let floats = [float, 0., 0., 0.]; + let floats_pointer = floats.as_ptr(); + let floats_register = unsafe { _mm_loadu_ps(floats_pointer) }; + unsafe { _mm_cvttss_si64(floats_register) } +} + +// For f32_to_i32 we could use CVTTSS2SI with 32 bit output (_mm_cvttss_si32) instead of the 64 bit output. That might be faster. + +// We can't use the same approach for u64 output because the conversion instruction only works on i64. This is a problem for floats that exceed i64::MAX. We cannot handle this with one instruction, but we can still do better than the as operator. + +// This approach branches into a special case if the input is too large. The branchless approach below is faster and is the one we use. We keep this code around for documentation purposes.
+#[inline(always)] +fn _f32_to_u64_branchful(float: f32) -> u64 { + const THRESHOLD_FLOAT: f32 = power_of_two_f32(63); + const THRESHOLD_INTEGER: u64 = 2u64.pow(63); + + let in_range = float <= THRESHOLD_FLOAT; + if in_range { + f32_to_i64(float) as u64 + } else { + // Subtract the threshold from the float. The result is >= 0 because the input is larger than the subtrahend. The result is <= i64::MAX because `u64::MAX - i64::MAX == i64::MAX`. + let in_range_float = float - THRESHOLD_FLOAT; + let integer = f32_to_i64(in_range_float) as u64; + // Overflow is benign because it can only occur for invalid inputs. + integer.overflowing_add(THRESHOLD_INTEGER).0 + } +} + +// This approach avoids the branch. It is faster than the branchful approach. +#[inline(always)] +fn f32_to_u64_branchless(float: f32) -> u64 { + const THRESHOLD: f32 = power_of_two_f32(63); + + let integer1 = f32_to_i64(float); + let integer2 = f32_to_i64(float - THRESHOLD); + // If the input is larger than i64::MAX, then integer1 is i64::MIN. This value has 1 as the leftmost bit and 0 as the remaining bits. Right shift on signed values is arithmetic, not logical [1]. We end up with all 0 (in range) or all 1 (out of range). + let too_large = integer1 >> 63; + // # If the input is not too large: + // + // Integer1 has the correct value. The mask is all 0, which makes the Or result in integer1. + // + // # If the input is too large: + // + // Integer1 is i64::MIN and the mask is all 1. The Or results in `i64::MIN | integer2`. integer2 has the correct result minus 2**63. This is the correct result without the leftmost bit. The Or adds the missing leftmost bit back. + (integer1 | (integer2 & too_large)) as u64 + + // [1] https://doc.rust-lang.org/reference/expressions/operator-expr.html#arithmetic-and-logical-binary-operators +} + +#[inline(always)] +fn f32_to_u64(float: f32) -> u64 { + f32_to_u64_branchless(float) +} + +// Repeat for f64. 
+ +#[inline(always)] +fn f64_to_i64(float: f64) -> i64 { + // see f32_to_i64 + + let floats = [float, 0.]; + let floats_pointer = floats.as_ptr(); + let floats_register = unsafe { _mm_loadu_pd(floats_pointer) }; + unsafe { _mm_cvttsd_si64(floats_register) } +} + +#[inline(always)] +fn f64_to_u64(float: f64) -> u64 { + // see f32_to_u64 + + const THRESHOLD: f64 = power_of_two_f64(63); + + let integer1 = f64_to_i64(float); + let integer2 = f64_to_i64(float - THRESHOLD); + let too_large = integer1 >> 63; + (integer1 | (integer2 & too_large)) as u64 +} + +pub mod implementation { + #[inline(always)] + pub fn f32_to_i8(float: f32) -> i8 { + super::f32_to_i64(float) as _ + } + + #[inline(always)] + pub fn f32_to_u8(float: f32) -> u8 { + super::f32_to_i64(float) as _ + } + + #[inline(always)] + pub fn f32_to_i16(float: f32) -> i16 { + super::f32_to_i64(float) as _ + } + + #[inline(always)] + pub fn f32_to_u16(float: f32) -> u16 { + super::f32_to_i64(float) as _ + } + + #[inline(always)] + pub fn f32_to_i32(float: f32) -> i32 { + super::f32_to_i64(float) as _ + } + + #[inline(always)] + pub fn f32_to_u32(float: f32) -> u32 { + super::f32_to_i64(float) as _ + } + + #[inline(always)] + pub fn f32_to_i64(float: f32) -> i64 { + super::f32_to_i64(float) as _ + } + + #[inline(always)] + pub fn f32_to_u64(float: f32) -> u64 { + super::f32_to_u64(float) as _ + } + + #[inline(always)] + pub fn f32_to_i128(float: f32) -> i128 { + float as _ + } + + #[inline(always)] + pub fn f32_to_u128(float: f32) -> u128 { + float as _ + } + + #[inline(always)] + pub fn f64_to_i8(float: f64) -> i8 { + f64_to_i64(float) as _ + } + + #[inline(always)] + pub fn f64_to_u8(float: f64) -> u8 { + super::f64_to_i64(float) as _ + } + + #[inline(always)] + pub fn f64_to_i16(float: f64) -> i16 { + super::f64_to_i64(float) as _ + } + + #[inline(always)] + pub fn f64_to_u16(float: f64) -> u16 { + super::f64_to_i64(float) as _ + } + + #[inline(always)] + pub fn f64_to_i32(float: f64) -> i32 { +
super::f64_to_i64(float) as _ + } + + #[inline(always)] + pub fn f64_to_u32(float: f64) -> u32 { + super::f64_to_i64(float) as _ + } + + #[inline(always)] + pub fn f64_to_i64(float: f64) -> i64 { + super::f64_to_i64(float) as _ + } + + #[inline(always)] + pub fn f64_to_u64(float: f64) -> u64 { + super::f64_to_u64(float) as _ + } + + #[inline(always)] + pub fn f64_to_i128(float: f64) -> i128 { + float as _ + } + + #[inline(always)] + pub fn f64_to_u128(float: f64) -> u128 { + float as _ + } +} diff --git a/src/target_x86_sse.rs b/src/target_x86_sse.rs new file mode 100644 index 0000000..2c842de --- /dev/null +++ b/src/target_x86_sse.rs @@ -0,0 +1,149 @@ +use core::arch::x86::{_mm_cvttsd_si32, _mm_cvttss_si32, _mm_loadu_pd, _mm_loadu_ps}; + +use crate::{power_of_two_f32, power_of_two_f64}; + +#[inline(always)] +fn f32_to_i32(float: f32) -> i32 { + // see crate::x86_64_sse::f32_to_i64 + + let floats = [float, 0., 0., 0.]; + let floats_pointer = floats.as_ptr(); + let floats_register = unsafe { _mm_loadu_ps(floats_pointer) }; + unsafe { _mm_cvttss_si32(floats_register) } +} + +#[inline(always)] +fn f32_to_u32(float: f32) -> u32 { + // see crate::x86_64_sse::f32_to_u64 + + const THRESHOLD: f32 = power_of_two_f32(31); + + let integer1 = f32_to_i32(float); + let integer2 = f32_to_i32(float - THRESHOLD); + let too_large = integer1 >> 31; + (integer1 | (integer2 & too_large)) as u32 +} + +#[inline(always)] +fn f64_to_i32(float: f64) -> i32 { + // see crate::x86_64_sse::f64_to_i64 + + let floats = [float, 0.]; + let floats_pointer = floats.as_ptr(); + let floats_register = unsafe { _mm_loadu_pd(floats_pointer) }; + unsafe { _mm_cvttsd_si32(floats_register) } +} + +#[inline(always)] +fn f64_to_u32(float: f64) -> u32 { + // see crate::x86_64_sse::f64_to_u64 + + const THRESHOLD: f64 = power_of_two_f64(31); + + let integer1 = f64_to_i32(float); + let integer2 = f64_to_i32(float - THRESHOLD); + let too_large = integer1 >> 31; + (integer1 | (integer2 & too_large)) as u32 +} + +pub 
mod implementation { + #[inline(always)] + pub fn f32_to_i8(float: f32) -> i8 { + super::f32_to_i32(float) as _ + } + + #[inline(always)] + pub fn f32_to_u8(float: f32) -> u8 { + super::f32_to_i32(float) as _ + } + + #[inline(always)] + pub fn f32_to_i16(float: f32) -> i16 { + super::f32_to_i32(float) as _ + } + + #[inline(always)] + pub fn f32_to_u16(float: f32) -> u16 { + super::f32_to_i32(float) as _ + } + + #[inline(always)] + pub fn f32_to_i32(float: f32) -> i32 { + super::f32_to_i32(float) as _ + } + + #[inline(always)] + pub fn f32_to_u32(float: f32) -> u32 { + super::f32_to_u32(float) as _ + } + + #[inline(always)] + pub fn f32_to_i64(float: f32) -> i64 { + float as _ + } + + #[inline(always)] + pub fn f32_to_u64(float: f32) -> u64 { + float as _ + } + + #[inline(always)] + pub fn f32_to_i128(float: f32) -> i128 { + float as _ + } + + #[inline(always)] + pub fn f32_to_u128(float: f32) -> u128 { + float as _ + } + + #[inline(always)] + pub fn f64_to_i8(float: f64) -> i8 { + super::f64_to_i32(float) as _ + } + + #[inline(always)] + pub fn f64_to_u8(float: f64) -> u8 { + super::f64_to_i32(float) as _ + } + + #[inline(always)] + pub fn f64_to_i16(float: f64) -> i16 { + super::f64_to_i32(float) as _ + } + + #[inline(always)] + pub fn f64_to_u16(float: f64) -> u16 { + super::f64_to_i32(float) as _ + } + + #[inline(always)] + pub fn f64_to_i32(float: f64) -> i32 { + super::f64_to_i32(float) as _ + } + + #[inline(always)] + pub fn f64_to_u32(float: f64) -> u32 { + super::f64_to_u32(float) as _ + } + + #[inline(always)] + pub fn f64_to_i64(float: f64) -> i64 { + float as _ + } + + #[inline(always)] + pub fn f64_to_u64(float: f64) -> u64 { + float as _ + } + + #[inline(always)] + pub fn f64_to_i128(float: f64) -> i128 { + float as _ + } + + #[inline(always)] + pub fn f64_to_u128(float: f64) -> u128 { + float as _ + } +} diff --git a/tests/test.rs b/tests/test.rs new file mode 100644 index 0000000..6787806 --- /dev/null +++ b/tests/test.rs @@ -0,0 +1,162 @@ +// TODO: 
Consider rewriting this with traits instead of macros. + +use float_next_after::NextAfter; + +trait InRange<T> { + /// Is this float value in range for this integer type? + #[allow(clippy::wrong_self_convention)] + fn in_range(self) -> bool; +} + +macro_rules! implement_is_valid { + ($Float:ty, $Integer:ty, $signed:expr) => { + impl InRange<$Integer> for $Float { + fn in_range(self) -> bool { + let bits = <$Integer>::BITS as i32; + let base: $Float = 2.; + if $signed { + self >= -base.powi(bits - 1) && self < base.powi(bits - 1) + } else { + self >= 0. && self < base.powi(bits) + } + } + } + }; +} + +implement_is_valid! {f32, i8, true} +implement_is_valid! {f32, u8, false} +implement_is_valid! {f32, i16, true} +implement_is_valid! {f32, u16, false} +implement_is_valid! {f32, i32, true} +implement_is_valid! {f32, u32, false} +implement_is_valid! {f32, i64, true} +implement_is_valid! {f32, u64, false} +implement_is_valid! {f32, i128, true} +implement_is_valid! {f32, u128, false} + +implement_is_valid! {f64, i8, true} +implement_is_valid! {f64, u8, false} +implement_is_valid! {f64, i16, true} +implement_is_valid! {f64, u16, false} +implement_is_valid! {f64, i32, true} +implement_is_valid! {f64, u32, false} +implement_is_valid! {f64, i64, true} +implement_is_valid! {f64, u64, false} +implement_is_valid! {f64, i128, true} +implement_is_valid! {f64, u128, false} + +// We can test all f32 values in 10 seconds on a modern processor. On qemu it is too slow. + +macro_rules! create_all_f32_test { + ($name:ident, $convert_custom:path, $Integer:ty) => { + #[test] + #[ignore] + fn $name() { + for i in u32::MIN..=u32::MAX { + let float = f32::from_bits(i); + let result = $convert_custom(float); + let expected = float as $Integer; + // We skip the assert but not the computation. This detects failing debug assertions in the implementation. + if InRange::<$Integer>::in_range(float) { + assert_eq!(result, expected, "{float:.0}"); + } + } + } + }; +} + +create_all_f32_test!
{all_f32_i8, fast_float_to_integer::f32_to_i8, i8} +create_all_f32_test! {all_f32_u8, fast_float_to_integer::f32_to_u8, u8} +create_all_f32_test! {all_f32_i16, fast_float_to_integer::f32_to_i16, i16} +create_all_f32_test! {all_f32_u16, fast_float_to_integer::f32_to_u16, u16} +create_all_f32_test! {all_f32_i32, fast_float_to_integer::f32_to_i32, i32} +create_all_f32_test! {all_f32_u32, fast_float_to_integer::f32_to_u32, u32} +create_all_f32_test! {all_f32_i64, fast_float_to_integer::f32_to_i64, i64} +create_all_f32_test! {all_f32_u64, fast_float_to_integer::f32_to_u64, u64} +create_all_f32_test! {all_f32_i128, fast_float_to_integer::f32_to_i128, i128} +create_all_f32_test! {all_f32_u128, fast_float_to_integer::f32_to_u128, u128} + +macro_rules! create_interesting_floats_function { + ($name:ident, $Float:ty) => { + fn $name() -> impl Iterator<Item = $Float> { + let signs = |float: $Float| [float, -float]; + + let neighbors = |float: $Float| { + [ + float.next_after(<$Float>::INFINITY), + float + .next_after(<$Float>::INFINITY) + .next_after(<$Float>::INFINITY), + float.next_after(<$Float>::NEG_INFINITY), + float + .next_after(<$Float>::NEG_INFINITY) + .next_after(<$Float>::NEG_INFINITY), + ] + }; + + let offsets = |float: $Float| [-2, -1, 0, 1, 2].map(|offset| float + offset as $Float); + + let exponents = 0..70; + exponents.flat_map(move |exponent| { + let float = (2 as $Float).powi(exponent); + offsets(float) + .into_iter() + .chain(neighbors(float)) + .chain([float * 1.5]) + .flat_map(signs) + }) + } + }; +} + +create_interesting_floats_function! {interesting_floats_f32, f32} +create_interesting_floats_function! {interesting_floats_f64, f64} + +#[test] +#[ignore] +fn print_interesting_floats() { + for float in interesting_floats_f32() { + println!("{float:.e} {float:.0} {:.x}", float.to_bits()); + } +} + +macro_rules!
create_interesting_floats_test { + ($name:ident, $interesting_floats_function:ident, $convert_custom:path, $Integer:ty) => { + #[test] + fn $name() { + let mut valid_count: u32 = 0; + for float in $interesting_floats_function() { + let result = $convert_custom(float); + let expected = float as $Integer; + if InRange::<$Integer>::in_range(float) { + valid_count += 1; + assert_eq!(result, expected, "{float:.0}"); + } + } + assert!((50..2000).contains(&valid_count), "{valid_count}"); + } + }; +} + +create_interesting_floats_test! {interesting_f32_i8, interesting_floats_f32, fast_float_to_integer::f32_to_i8, i8} +create_interesting_floats_test! {interesting_f32_u8, interesting_floats_f32, fast_float_to_integer::f32_to_u8, u8} +create_interesting_floats_test! {interesting_f32_i16, interesting_floats_f32, fast_float_to_integer::f32_to_i16, i16} +create_interesting_floats_test! {interesting_f32_u16, interesting_floats_f32, fast_float_to_integer::f32_to_u16, u16} +create_interesting_floats_test! {interesting_f32_i32, interesting_floats_f32, fast_float_to_integer::f32_to_i32, i32} +create_interesting_floats_test! {interesting_f32_u32, interesting_floats_f32, fast_float_to_integer::f32_to_u32, u32} +create_interesting_floats_test! {interesting_f32_i64, interesting_floats_f32, fast_float_to_integer::f32_to_i64, i64} +create_interesting_floats_test! {interesting_f32_u64, interesting_floats_f32, fast_float_to_integer::f32_to_u64, u64} +create_interesting_floats_test! {interesting_f32_i128, interesting_floats_f32, fast_float_to_integer::f32_to_i128, i128} +create_interesting_floats_test! {interesting_f32_u128, interesting_floats_f32, fast_float_to_integer::f32_to_u128, u128} + +create_interesting_floats_test! {interesting_f64_i8, interesting_floats_f64, fast_float_to_integer::f64_to_i8, i8} +create_interesting_floats_test! {interesting_f64_u8, interesting_floats_f64, fast_float_to_integer::f64_to_u8, u8} +create_interesting_floats_test! 
{interesting_f64_i16, interesting_floats_f64, fast_float_to_integer::f64_to_i16, i16} +create_interesting_floats_test! {interesting_f64_u16, interesting_floats_f64, fast_float_to_integer::f64_to_u16, u16} +create_interesting_floats_test! {interesting_f64_i32, interesting_floats_f64, fast_float_to_integer::f64_to_i32, i32} +create_interesting_floats_test! {interesting_f64_u32, interesting_floats_f64, fast_float_to_integer::f64_to_u32, u32} +create_interesting_floats_test! {interesting_f64_i64, interesting_floats_f64, fast_float_to_integer::f64_to_i64, i64} +create_interesting_floats_test! {interesting_f64_u64, interesting_floats_f64, fast_float_to_integer::f64_to_u64, u64} +create_interesting_floats_test! {interesting_f64_i128, interesting_floats_f64, fast_float_to_integer::f64_to_i128, i128} +create_interesting_floats_test! {interesting_f64_u128, interesting_floats_f64, fast_float_to_integer::f64_to_u128, u128} diff --git a/xtask/Cargo.toml b/xtask/Cargo.toml new file mode 100644 index 0000000..0555b03 --- /dev/null +++ b/xtask/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "xtask" +version = "0.1.0" +edition = "2021" +publish = false + +[dependencies] +anyhow = "1.0" +regex = { version = "1.11.1", default-features = false } diff --git a/xtask/src/main.rs b/xtask/src/main.rs new file mode 100644 index 0000000..a6e0bcc --- /dev/null +++ b/xtask/src/main.rs @@ -0,0 +1,374 @@ +use std::{ + borrow::Cow, + collections::{hash_map::Entry, HashMap}, + fmt::Write, + path::PathBuf, + process::{Command, Output}, + sync::LazyLock, +}; + +use anyhow::{anyhow, Context, Result}; +use regex::Regex; + +fn main() -> Result<()> { + let command = std::env::args() + .nth(1) + .context("missing command argument")?; + let command = match command.as_str() { + "check" => |target: &Target| check(target, false), + "clippy" => |target: &Target| check(target, true), + "target" => |target: &Target| expected_target(target), + "test" => |target: &Target| qemu_test(target), + "asm" => |target: 
&Target| show_asm(target), + "all" => |target: &Target| { + check(target, true).context("check")?; + expected_target(target).context("target")?; + qemu_test(target).context("test")?; + show_asm(target).context("asm")?; + Ok(()) + }, + _ => return Err(anyhow!("unknown command")), + }; + + for target in TARGETS { + println!("Handling target {}.", target.name); + install_rustup_target(target.rust_target).context("install rustup target")?; + command(target)?; + } + + Ok(()) +} + +struct Target { + name: &'static str, + rust_target: &'static str, + expected_target_module: &'static str, + feature: &'static str, + qemu: &'static str, + generate_assembly: bool, + force_default: bool, +} + +const TARGETS: &[Target] = &[ + Target { + name: "x86_64_sse", + rust_target: "x86_64-unknown-linux-gnu", + expected_target_module: "x86_64_sse", + feature: "+sse", + qemu: "x86_64", + generate_assembly: true, + force_default: false, + }, + Target { + name: "x86_64_default", + rust_target: "x86_64-unknown-linux-gnu", + expected_target_module: "default", + feature: "", + qemu: "x86_64", + generate_assembly: true, + force_default: true, + }, + Target { + name: "x86_sse", + rust_target: "i686-unknown-linux-gnu", + expected_target_module: "x86_sse", + feature: "+sse", + qemu: "i386", + generate_assembly: true, + force_default: false, + }, + Target { + name: "default", + rust_target: "i686-unknown-linux-gnu", + expected_target_module: "default", + feature: "-sse", + qemu: "i386", + generate_assembly: false, + force_default: false, + }, +]; + +/// Convert a Command to a string representation you can paste in your terminal. +/// +/// Assumes that the command does not run into tricky formatting edge cases with characters that need to be escaped. 
+fn command_to_string(command: &Command) -> String { + fn string_is_not_tricky(string: &str) -> bool { + string.chars().all(|char| { + char.is_ascii_alphanumeric() || ['-', '_', '=', '/', '.', '+', ' '].contains(&char) + }) + } + + fn handle_space(s: &str) -> Cow { + if s.contains(' ') { + format!("\"{s}\"").into() + } else { + s.into() + } + } + + let mut string = String::new(); + + let envs = command.get_envs(); + let has_envs = envs.len() > 0; + if has_envs { + write!(&mut string, "env").unwrap(); + } + for (key, value) in envs { + let key = key.to_str().unwrap(); + let value = value.unwrap_or_default().to_str().unwrap(); + assert!(string_is_not_tricky(key), "{key:?}"); + assert!(string_is_not_tricky(value), "{value:?}"); + let key = handle_space(key); + let value = handle_space(value); + write!(&mut string, " {key}={value}").unwrap(); + } + if has_envs { + write!(&mut string, " ").unwrap(); + } + + let program = command.get_program().to_str().unwrap(); + assert!(string_is_not_tricky(program), "{program:?}"); + let program = handle_space(program); + write!(&mut string, "{program}").unwrap(); + + for arg in command.get_args() { + let arg = arg.to_str().unwrap(); + assert!(string_is_not_tricky(arg), "{arg:?}"); + let arg = handle_space(arg); + write!(&mut string, " {}", arg).unwrap(); + } + + string +} + +/// Run a command while checking status code and providing a better error message. 
+fn run_command(command: &mut Command) -> Result { + let make_string = |command: &Command| format!("command: {}", command_to_string(command)); + let output = command + .output() + .context("command failed") + .with_context(|| make_string(command))?; + if !output.status.success() { + let stdout = String::from_utf8_lossy(output.stdout.as_slice()); + let stderr = String::from_utf8_lossy(output.stderr.as_slice()); + return Err(anyhow!("command status indicates error") + .context(format!("command: {}", make_string(command))) + .context(format!("stdout: {stdout}")) + .context(format!("stderr: {stderr}"))); + } + Ok(output) +} + +fn install_rustup_target(target: &str) -> Result<()> { + run_command(Command::new("rustup").args(["--quiet", "target", "add", target]))?; + Ok(()) +} + +fn cargo_with_target( + Target { + rust_target: target, + feature, + .. + }: &Target, + subcommand: &str, + rustflags: &[&str], +) -> Command { + let mut flags = String::new(); + write!(&mut flags, "-Ctarget-feature={feature}").unwrap(); + for flag in rustflags { + write!(&mut flags, " {flag}").unwrap(); + } + let target_arg = format!("--target={target}"); + let mut command = Command::new("cargo"); + command + .env("RUSTFLAGS", flags.as_str()) + .args([subcommand, target_arg.as_str()]); + command +} + +fn check(target: &Target, clippy: bool) -> Result<()> { + let command = match clippy { + true => "clippy", + false => "check", + }; + let features = if target.force_default { + "--features=force-default" + } else { + "--features=" + }; + let mut command = cargo_with_target(target, command, &[]); + command.args([ + "--quiet", + "--frozen", + "--package=fast-float-to-integer", + "--all-targets", + features, + ]); + if clippy { + command.args(["--", "-D=warnings"]); + } + run_command(&mut command)?; + Ok(()) +} + +fn show_asm(target: &Target) -> Result<()> { + if !target.generate_assembly { + return Ok(()); + } + + let functions = [ + "f32_to_i8", + "f32_to_u8", + "f32_to_i16", + "f32_to_u16", + 
"f32_to_i32", + "f32_to_u32", + "f32_to_i64", + "f32_to_u64", + "f32_to_i128", + "f32_to_u128", + "f64_to_i8", + "f64_to_u8", + "f64_to_i16", + "f64_to_u16", + "f64_to_i32", + "f64_to_u32", + "f64_to_i64", + "f64_to_u64", + "f64_to_i128", + "f64_to_u128", + ]; + + let mut features = "--features=show-asm".to_owned(); + if target.force_default { + features.push_str(",force-default"); + } + + for function in functions { + let output = run_command(cargo_with_target(target, "asm", &[]).args([ + // "--quiet", // will be supported in next cargo asm release + "--no-color", + "--simplify", + "--include-constants", + "--package=fast-float-to-integer", + "--lib", + features.as_str(), + "--profile=show-asm", + function, + ]))?; + let output = std::str::from_utf8(output.stdout.as_slice()).unwrap(); + let output = normalize_assembly(output); + + let mut path = PathBuf::new(); + path.push("generated assembly"); + path.push(target.name); + std::fs::create_dir_all(&path).context("create_dir_all")?; + path.push(function); + std::fs::write(&path, output.as_ref()).context("write generated assembly")?; + } + + Ok(()) +} + +/// We diff the generated assembly to make sure it doesn't accidentally change. This requires the assembly to be deterministic. By default, some parts of the assembly like labels are not deterministic. This function fixes that. 
+fn normalize_assembly(assembly: &str) -> Cow { + const REGEX: &str = r"\.L([[:alnum:]]|_)+"; + static RE: LazyLock = LazyLock::new(|| Regex::new(REGEX).unwrap()); + + let mut matches = RE.find_iter(assembly).peekable(); + if matches.peek().is_none() { + return Cow::Borrowed(assembly); + } + + let mut result = String::new(); + let mut labels = HashMap::<&str, usize>::new(); + let mut previous_match_end = 0usize; + for label in matches { + let mut label_index = labels.len(); + label_index = match labels.entry(label.as_str()) { + Entry::Occupied(entry) => *entry.get(), + Entry::Vacant(entry) => *entry.insert(label_index), + }; + + let range = label.range(); + result.push_str(&assembly[previous_match_end..range.start]); + write!(&mut result, ".L_{label_index}").unwrap(); + previous_match_end = range.end; + } + result.push_str(&assembly[previous_match_end..]); + Cow::Owned(result) +} + +#[test] +fn normalize_assembly_() { + let input = "abcd"; + let expected = "abcd"; + let actual = normalize_assembly(input); + assert_eq!(actual, expected); + + let input = "a .LCPI2_0 b .LCPI3_0 c .LCPI2_0 d"; + let expected = "a .L_0 b .L_1 c .L_0 d"; + let actual = normalize_assembly(input); + assert_eq!(actual, expected); +} + +fn qemu_test(target: &Target) -> Result<()> { + let features = if target.force_default { + "--features=force-default" + } else { + "--features=" + }; + let output = run_command(cargo_with_target(target, "test", &[]).args([ + "--frozen", + "--no-run", + "--package=fast-float-to-integer", + "--test=test", + features, + ]))?; + let stderr = std::str::from_utf8(output.stderr.as_slice()).context("output is not utf8")?; + + let test_binary_path = stderr + .rsplit('\n') + .nth(1) + .context("unexpected output")? + .strip_prefix(" Executable tests/test.rs (") + .context("unexpected output")? 
+ .strip_suffix(')') + .context("unexpected output")?; + + run_command( + Command::new(format!("qemu-{}", target.qemu)).args([test_binary_path, "--test-threads=1"]), + )?; + + Ok(()) +} + +fn expected_target(target: &Target) -> Result<()> { + let features = if target.force_default { + "--features=force-default" + } else { + "--features=" + }; + let output = run_command(cargo_with_target(target, "test", &[]).args([ + "--quiet", + "--package=fast-float-to-integer", + features, + "--lib", + "--", + "--list", + ]))?; + let actual = std::str::from_utf8(output.stdout.as_slice()) + .context("output is not utf8")? + .strip_prefix("target_") + .context("unexpected stdout")? + .strip_suffix(": test\n") + .context("unexpected stdout")?; + if actual != target.expected_target_module { + return Err(anyhow!( + "actual target {} does not match expected target {}", + actual, + target.expected_target_module, + )); + } + Ok(()) +}