Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

workaround cdt's small const memcpy host function calls in EOS VM OC #1016

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,7 @@ using intrinsic_map_t = std::map<std::string, intrinsic_entry>;

const intrinsic_map_t& get_intrinsic_map();

static constexpr unsigned minimum_const_memcpy_intrinsic_to_optimize = 1;
static constexpr unsigned maximum_const_memcpy_intrinsic_to_optimize = 128;
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

128 is a bit of a dice roll. I don't have data to know where the exact threshold is between this local copy being faster vs slower than the full call off to memcpy. I wouldn't want to go higher than this anyways though, since it bloats the code a little more.


Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a weird place for this, but header files for OC are a little weird because of some parts still are compiled with C++17 -- need to place these in a header file that is consumed by the C++17 code.

}}}
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,8 @@ inline constexpr auto get_intrinsic_table() {
"env.bls_fp_mod",
"env.bls_fp_mul",
"env.bls_fp_exp",
"env.set_finalizers"
"env.set_finalizers",
"eosvmoc_internal.check_memcpy_params"
);
}
inline constexpr std::size_t find_intrinsic_index(std::string_view hf) {
Expand Down
24 changes: 24 additions & 0 deletions libraries/chain/webassembly/runtimes/eos-vm-oc/LLVMEmitIR.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -697,12 +697,14 @@ namespace LLVMJIT
llvm::Value* callee;
const FunctionType* calleeType;
bool isExit = false;
bool isMemcpy = false;
if(imm.functionIndex < moduleContext.importedFunctionOffsets.size())
{
calleeType = module.types[module.functions.imports[imm.functionIndex].type.index];
llvm::Value* ic = irBuilder.CreateLoad( emitLiteralPointer((void*)(OFFSET_OF_FIRST_INTRINSIC-moduleContext.importedFunctionOffsets[imm.functionIndex]*8), llvmI64Type->getPointerTo(256)) );
callee = irBuilder.CreateIntToPtr(ic, asLLVMType(calleeType)->getPointerTo());
isExit = module.functions.imports[imm.functionIndex].moduleName == "env" && module.functions.imports[imm.functionIndex].exportName == "eosio_exit";
isMemcpy = module.functions.imports[imm.functionIndex].moduleName == "env" && module.functions.imports[imm.functionIndex].exportName == "memcpy";
}
else
{
Expand All @@ -715,6 +717,28 @@ namespace LLVMJIT
auto llvmArgs = (llvm::Value**)alloca(sizeof(llvm::Value*) * calleeType->parameters.size());
popMultiple(llvmArgs,calleeType->parameters.size());

//convert small constant sized memcpy host function calls to a load+store (plus small call to validate non-overlap rule)
if(isMemcpy) {
assert(calleeType->parameters.size() == 3);
if(llvm::ConstantInt* const_memcpy_sz = llvm::dyn_cast<llvm::ConstantInt>(llvmArgs[2]);
const_memcpy_sz &&
const_memcpy_sz->getZExtValue() >= minimum_const_memcpy_intrinsic_to_optimize &&
const_memcpy_sz->getZExtValue() <= maximum_const_memcpy_intrinsic_to_optimize) {
const unsigned sz_value = const_memcpy_sz->getZExtValue();
llvm::IntegerType* type_of_memcpy_width = llvm::Type::getIntNTy(context, sz_value*8);
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The use of a non-power-of-2 type size here is rather unorthodox, though one would have to imagine something like _BitInt from C23 would do exactly this. Unit tests exhaustively explore every size (1 through 128). An oddball size does generate mostly expected machine code (loads and stores are maybe not in "natural" order one would expect but that doesn't matter of course), for example for size 13:

		                pushq	%rax
		                decl	%gs:-18888
		                je	83
		                movl	%gs:208, %eax  ;;load 4 bytes (208-211)
		                movq	%gs:200, %rcx  ;;load 8 bytes (200-207)
		                movb	%gs:212, %dl   ;;load 1 byte  (212)
		                movb	%dl, %gs:16    ;;store 1 byte  (16)
		                movq	%rcx, %gs:4    ;;store 8 bytes (4-11)
		                movl	%eax, %gs:12   ;;store 4 bytes (12-15)
		                movl	$4, %edi
		                movl	$200, %esi
		                movl	$13, %edx
		                callq	*%gs:-21056
		                incl	%gs:-18888
		                popq	%rax
		                retq
		                callq	*%gs:-18992


llvm::Value* load_pointer = coerceByteIndexToPointer(llvmArgs[1],0,type_of_memcpy_width);
llvm::Value* store_pointer = coerceByteIndexToPointer(llvmArgs[0],0,type_of_memcpy_width);
irBuilder.CreateStore(irBuilder.CreateLoad(load_pointer), store_pointer, true);

emitRuntimeIntrinsic("eosvmoc_internal.check_memcpy_params",
FunctionType::get(ResultType::none,{ValueType::i32,ValueType::i32,ValueType::i32}),
{llvmArgs[0],llvmArgs[1],llvmArgs[2]});
push(llvmArgs[0]);
return;
}
}

// Call the function.
auto result = createCall(callee,llvm::ArrayRef<llvm::Value*>(llvmArgs,calleeType->parameters.size()));
if(isExit) {
Expand Down
32 changes: 25 additions & 7 deletions libraries/chain/webassembly/runtimes/eos-vm-oc/executor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,9 @@ static intrinsic eosio_exit_intrinsic("env.eosio_exit", IR::FunctionType::get(IR
std::integral_constant<std::size_t, find_intrinsic_index("env.eosio_exit")>::value
);

static void throw_internal_exception(const char* const s) {
*reinterpret_cast<std::exception_ptr*>(eos_vm_oc_get_exception_ptr()) = std::make_exception_ptr(wasm_execution_error(FC_LOG_MESSAGE(error, s)));
template <typename E>
static void throw_internal_exception(const E& e) {
*reinterpret_cast<std::exception_ptr*>(eos_vm_oc_get_exception_ptr()) = std::make_exception_ptr(e);
siglongjmp(*eos_vm_oc_get_jmp_buf(), EOSVMOC_EXIT_EXCEPTION);
__builtin_unreachable();
}
Expand All @@ -102,25 +103,42 @@ static void throw_internal_exception(const char* const s) {
void name()

DEFINE_EOSVMOC_TRAP_INTRINSIC(eosvmoc_internal,depth_assert) {
throw_internal_exception("Exceeded call depth maximum");
throw_internal_exception(wasm_execution_error(FC_LOG_MESSAGE(error, "Exceeded call depth maximum")));
}

DEFINE_EOSVMOC_TRAP_INTRINSIC(eosvmoc_internal,div0_or_overflow) {
throw_internal_exception("Division by 0 or integer overflow trapped");
throw_internal_exception(wasm_execution_error(FC_LOG_MESSAGE(error, "Division by 0 or integer overflow trapped")));
}

DEFINE_EOSVMOC_TRAP_INTRINSIC(eosvmoc_internal,indirect_call_mismatch) {
throw_internal_exception("Indirect call function type mismatch");
throw_internal_exception(wasm_execution_error(FC_LOG_MESSAGE(error, "Indirect call function type mismatch")));
}

DEFINE_EOSVMOC_TRAP_INTRINSIC(eosvmoc_internal,indirect_call_oob) {
throw_internal_exception("Indirect call index out of bounds");
throw_internal_exception(wasm_execution_error(FC_LOG_MESSAGE(error, "Indirect call index out of bounds")));
}

DEFINE_EOSVMOC_TRAP_INTRINSIC(eosvmoc_internal,unreachable) {
throw_internal_exception("Unreachable reached");
throw_internal_exception(wasm_execution_error(FC_LOG_MESSAGE(error, "Unreachable reached")));
}

static void eos_vm_oc_check_memcpy_params(int32_t dest, int32_t src, int32_t length) {
//make sure dest & src are zexted when converted from signed 32-bit to signed ptrdiff_t; length should always be small but do it too
const unsigned udest = dest;
const unsigned usrc = src;
const unsigned ulength = length;

//this must remain the same behavior as the memcpy host function
if((size_t)(std::abs((ptrdiff_t)udest - (ptrdiff_t)usrc)) >= ulength)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This check compiles down to about ~11 instructions but can still consume a substantial amount of CPU (beyond 1% in some cases). Perhaps something more clever can shave off another 0.5%.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

wonder if

if ((std::max(udest, usrc) - std::min(udest, usrc)) >= ulength)

might be faster?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This check compiles down to about ~11 instructions but can still consume a substantial amount of CPU (beyond 1% in some cases). Perhaps something more clever can shave off another 0.5%.

Did you try plain if comparisons without using std::abs?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Both of these interesting suggestions and worth trying. Though, I'd suggest we keep the existing impl initially as it's most like what already exists (and has already been performance tested at the moment).

I'm also curious what would happen if the check was moved completely inline. The obviously bad aspect of this is that it's going to put a lot of pressure on the branch prediction cache. But.. there is a non-trivial amount of work just to get the values in to the right registers to pass off to a standard ABI function; then again maybe these get pipelined well. Plenty to experiment with here.

return;
throw_internal_exception(overlapping_memory_error(FC_LOG_MESSAGE(error, "memcpy can only accept non-aliasing pointers")));
}

static intrinsic check_memcpy_params_intrinsic("eosvmoc_internal.check_memcpy_params", IR::FunctionType::get(IR::ResultType::none,{IR::ValueType::i32,IR::ValueType::i32,IR::ValueType::i32}),
(void*)&eos_vm_oc_check_memcpy_params,
std::integral_constant<std::size_t, find_intrinsic_index("eosvmoc_internal.check_memcpy_params")>::value
);

struct executor_signal_init {
executor_signal_init() {
struct sigaction sig_action, old_sig_action;
Expand Down
81 changes: 81 additions & 0 deletions unittests/api_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include <eosio/chain/wasm_interface.hpp>
#include <eosio/chain/resource_limits.hpp>
#include <eosio/chain/finalizer_authority.hpp>
#include <eosio/chain/webassembly/eos-vm-oc.hpp>

#include <fc/crypto/digest.hpp>
#include <fc/crypto/sha256.hpp>
Expand Down Expand Up @@ -3352,4 +3353,84 @@ BOOST_AUTO_TEST_CASE_TEMPLATE( get_code_hash_tests, T, validating_testers ) { tr
check("test"_n, 3);
} FC_LOG_AND_RETHROW() }

BOOST_AUTO_TEST_CASE_TEMPLATE( small_const_memcpy_tests, T, validating_testers ) { try {
T t;
t.create_account("smallmemcpy"_n);
t.produce_block();

for(unsigned i = eosvmoc::minimum_const_memcpy_intrinsic_to_optimize; i <= eosvmoc::maximum_const_memcpy_intrinsic_to_optimize; ++i) {
t.set_code("smallmemcpy"_n, fc::format_string(small_memcpy_const_dstsrc_wastfmt, fc::mutable_variant_object("COPY_SIZE", i)).c_str());

signed_transaction trx;
action act;
act.account = "smallmemcpy"_n;
act.name = ""_n;
act.authorization = vector<permission_level>{{"smallmemcpy"_n,config::active_name}};
act.data.push_back(i);
trx.actions.push_back(act);
t.set_transaction_headers(trx);
trx.sign(t.get_private_key( "smallmemcpy"_n, "active" ), t.get_chain_id());
t.push_transaction(trx);

if(i%10 == 0)
t.produce_block();
}

} FC_LOG_AND_RETHROW() }

//similar to above, but the source and destination values passed to memcpy are not consts
BOOST_AUTO_TEST_CASE_TEMPLATE( small_var_memcpy_tests, T, validating_testers ) { try {
T t;
t.create_account("smallmemcpy"_n);
t.produce_block();

for(unsigned i = eosvmoc::minimum_const_memcpy_intrinsic_to_optimize; i <= eosvmoc::maximum_const_memcpy_intrinsic_to_optimize; ++i) {
t.set_code("smallmemcpy"_n, fc::format_string(small_memcpy_var_dstsrc_wastfmt, fc::mutable_variant_object("COPY_SIZE", i)).c_str());

signed_transaction trx;
action act;
act.account = "smallmemcpy"_n;
act.name = ""_n;
act.authorization = vector<permission_level>{{"smallmemcpy"_n,config::active_name}};
act.data.push_back(i);
trx.actions.push_back(act);
t.set_transaction_headers(trx);
trx.sign(t.get_private_key( "smallmemcpy"_n, "active" ), t.get_chain_id());
t.push_transaction(trx);

if(i%10 == 0)
t.produce_block();
}

} FC_LOG_AND_RETHROW() }

//check that small constant sized memcpys (that OC will optimize "away") correctly fail on edge or high side of invalid memory
BOOST_AUTO_TEST_CASE_TEMPLATE( small_const_memcpy_oob_tests, T, validating_testers ) { try {
T t;
t.create_account("smallmemcpy"_n);
t.produce_block();

auto sendit = [&]() {
signed_transaction trx;
action act;
act.account = "smallmemcpy"_n;
act.name = ""_n;
act.authorization = vector<permission_level>{{"smallmemcpy"_n,config::active_name}};
trx.actions.push_back(act);
t.set_transaction_headers(trx);
trx.sign(t.get_private_key( "smallmemcpy"_n, "active" ), t.get_chain_id());
t.push_transaction(trx);
};

t.set_code("smallmemcpy"_n, small_memcpy_overlapenddst_wast);
BOOST_REQUIRE_EXCEPTION(sendit(), eosio::chain::wasm_execution_error, [](const eosio::chain::wasm_execution_error& e) {return expect_assert_message(e, "access violation");});

t.set_code("smallmemcpy"_n, small_memcpy_overlapendsrc_wast);
BOOST_REQUIRE_EXCEPTION(sendit(), eosio::chain::wasm_execution_error, [](const eosio::chain::wasm_execution_error& e) {return expect_assert_message(e, "access violation");});

t.set_code("smallmemcpy"_n, small_memcpy_high_wast);
BOOST_REQUIRE_EXCEPTION(sendit(), eosio::chain::wasm_execution_error, [](const eosio::chain::wasm_execution_error& e) {return expect_assert_message(e, "access violation");});

} FC_LOG_AND_RETHROW() }

BOOST_AUTO_TEST_SUITE_END()
80 changes: 80 additions & 0 deletions unittests/contracts/test_wasts.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1037,4 +1037,84 @@ static const char divmod_host_function_overflow_wast[] = R"=====(
))
)
)
)=====";

static const char small_memcpy_const_dstsrc_wastfmt[] = R"=====(
(module
(import "env" "memcpy" (func $$memcpy (param i32 i32 i32) (result i32)))
(import "env" "memcmp" (func $$memcmp (param i32 i32 i32) (result i32)))
(export "apply" (func $$apply))
(memory 1)
(func $$apply (param i64) (param i64) (param i64)
;; do copy and check that return value is dst
(if (i32.ne (call $$memcpy (i32.const 256) (i32.const 64) (i32.const ${COPY_SIZE})) (i32.const 256)) (then unreachable))

;; validate copied region
(if (i32.ne (call $$memcmp (i32.const 256) (i32.const 64) (i32.const ${COPY_SIZE})) (i32.const 0)) (then unreachable))

;; check the 4 bytes before and and after the copied region and expect them to still be 0x0
(if (i32.ne (i32.load (i32.const 252)) (i32.const 0)) (then unreachable))
(if (i32.ne (i32.load (i32.add (i32.const 256) (i32.const ${COPY_SIZE}))) (i32.const 0)) (then unreachable))
)
(data (i32.const 64) "1234567890-abcdefghijklmnopqrstuvwxyz_ABCDEFGHIJKLMNOPQRSTUVWXYZ")
)
)=====";

static const char small_memcpy_var_dstsrc_wastfmt[] = R"=====(
(module
(import "env" "memcpy" (func $$memcpy (param i32 i32 i32) (result i32)))
(import "env" "memcmp" (func $$memcmp (param i32 i32 i32) (result i32)))
(export "apply" (func $$apply))
(memory 1)
(func $$doit (param $$dst i32) (param $$src i32)
;; do copy and check that return value is dst
(if (i32.ne (call $$memcpy (get_local $$dst) (get_local $$src) (i32.const ${COPY_SIZE})) (get_local $$dst)) (then unreachable))

;; validate copied region
(if (i32.ne (call $$memcmp (get_local $$dst) (get_local $$src) (i32.const ${COPY_SIZE})) (i32.const 0)) (then unreachable))

;; check the 4 bytes before and and after the copied region and expect them to still be 0x0
(if (i32.ne (i32.load (i32.sub (get_local $$dst) (i32.const 4))) (i32.const 0)) (then unreachable))
(if (i32.ne (i32.load (i32.add (get_local $$dst) (i32.const ${COPY_SIZE}))) (i32.const 0)) (then unreachable))
)
(func $$apply (param i64) (param i64) (param i64)
(call $$doit (i32.const 256) (i32.const 64))
)
(data (i32.const 64) "1234567890-abcdefghijklmnopqrstuvwxyz_ABCDEFGHIJKLMNOPQRSTUVWXYZ")
)
)=====";

static const char small_memcpy_overlapenddst_wast[] = R"=====(
(module
(import "env" "memcpy" (func $memcpy (param i32 i32 i32) (result i32)))
(export "apply" (func $apply))
(memory 1)
(func $apply (param i64) (param i64) (param i64)
(drop (call $memcpy (i32.const 65532) (i32.const 64) (i32.const 8)))
)

(data (i32.const 64) "1234567890-abcdefghijklmnopqrstuvwxyz_ABCDEFGHIJKLMNOPQRSTUVWXYZ")
)
)=====";

static const char small_memcpy_overlapendsrc_wast[] = R"=====(
(module
(import "env" "memcpy" (func $memcpy (param i32 i32 i32) (result i32)))
(export "apply" (func $apply))
(memory 1)
(func $apply (param i64) (param i64) (param i64)
(drop (call $memcpy (i32.const 4) (i32.const 65532) (i32.const 8)))
)
)
)=====";

static const char small_memcpy_high_wast[] = R"=====(
(module
(import "env" "memcpy" (func $memcpy (param i32 i32 i32) (result i32)))
(export "apply" (func $apply))
(memory 1)
(func $apply (param i64) (param i64) (param i64)
(drop (call $memcpy (i32.const 4294967295) (i32.const 4294967200) (i32.const 8)))
)
)
)=====";