Skip to content

Commit

Permalink
remove macOS/aarch64 workaround from proposeBlockAux (#6138)
Browse files Browse the repository at this point in the history
  • Loading branch information
tersec authored Mar 26, 2024
1 parent 54d5699 commit 605bf99
Showing 1 changed file with 0 additions and 131 deletions.
131 changes: 0 additions & 131 deletions beacon_chain/validators/beacon_validators.nim
Original file line number Diff line number Diff line change
Expand Up @@ -1147,137 +1147,6 @@ proc proposeBlockAux(
blobsBundle.proofs, blobsBundle.blobs))
else:
Opt.none(seq[BlobSidecar])

# BIG BUG SOURCE: The `let` below cannot be combined with the others above!
# If combined, there are sometimes `SIGSEGV` during `test_keymanager_api`.
# This has only been observed on macOS (aarch64) in Jenkins, not on GitHub.
#
# - macOS 14.2.1 (23C71)
# - Xcode 15.1 (15C65)
# - Nim v1.6.18 (a749a8b742bd0a4272c26a65517275db4720e58a)
#
# Issue has started occuring around 12 Jan 2024, in a CI run for PR #5731.
# The PR did not change anything related to this, suggesting an environment
# or hardware change. The issue is flaky; could have been introduced earlier
# before surfacing in the aforementioned PR. About 30% to hit bug.
#
# [2024-01-12T11:54:21.011Z] Wrote test_keymanager_api/bootstrap_node.enr
# [2024-01-12T11:54:29.294Z] Serialization/deserialization [Beacon Node] [Preset: mainnet] . (0.00s)
# [2024-01-12T11:54:29.294Z] ListKeys requests [Beacon Node] [Preset: mainnet] .... (0.01s)
# [2024-01-12T11:54:34.870Z] ImportKeystores requests [Beacon Node] [Preset: mainnet] Traceback (most recent call last, using override)
# [2024-01-12T11:54:34.870Z] vendor/nim-libp2p/libp2p/protocols/rendezvous.nim(1016) main
# [2024-01-12T11:54:34.870Z] vendor/nim-libp2p/libp2p/protocols/rendezvous.nim(1006) NimMain
# [2024-01-12T11:54:34.870Z] vendor/nim-libp2p/libp2p/protocols/rendezvous.nim(997) PreMain
# [2024-01-12T11:54:34.870Z] tests/test_keymanager_api.nim(1502) atmtest_keymanager_apidotnim_Init000
# [2024-01-12T11:54:34.870Z] tests/test_keymanager_api.nim(1475) main
# [2024-01-12T11:54:34.870Z] vendor/nim-chronos/chronos/internal/asyncfutures.nim(378) futureContinue
# [2024-01-12T11:54:34.870Z] tests/test_keymanager_api.nim(1481) main
# [2024-01-12T11:54:34.870Z] tests/test_keymanager_api.nim(307) startBeaconNode
# [2024-01-12T11:54:34.870Z] beacon_chain/nimbus_beacon_node.nim(1900) start
# [2024-01-12T11:54:34.870Z] beacon_chain/nimbus_beacon_node.nim(1847) run
# [2024-01-12T11:54:34.870Z] vendor/nim-chronos/chronos/internal/asyncengine.nim(150) poll
# [2024-01-12T11:54:34.870Z] vendor/nim-chronos/chronos/internal/asyncfutures.nim(378) futureContinue
# [2024-01-12T11:54:34.870Z] tests/test_keymanager_api.nim(1465) delayedTests
# [2024-01-12T11:54:34.870Z] tests/test_keymanager_api.nim(392) runTests
# [2024-01-12T11:54:34.870Z] vendor/nim-chronos/chronos/internal/asyncfutures.nim(378) futureContinue
# [2024-01-12T11:54:34.870Z] vendor/nim-unittest2/unittest2.nim(1147) runTests
# [2024-01-12T11:54:34.870Z] vendor/nim-unittest2/unittest2.nim(1086) runDirect
# [2024-01-12T11:54:34.870Z] vendor/nim-testutils/testutils/unittests.nim(16) runTestX60gensym2933
# [2024-01-12T11:54:34.870Z] vendor/nim-chronos/chronos/internal/asyncfutures.nim(656) waitFor
# [2024-01-12T11:54:34.870Z] vendor/nim-chronos/chronos/internal/asyncfutures.nim(631) pollFor
# [2024-01-12T11:54:34.870Z] vendor/nim-chronos/chronos/internal/asyncengine.nim(150) poll
# [2024-01-12T11:54:34.870Z] vendor/nim-chronos/chronos/internal/asyncfutures.nim(378) futureContinue
# [2024-01-12T11:54:34.870Z] beacon_chain/validators/beacon_validators.nim(82) proposeBlockAux
# [2024-01-12T11:54:34.870Z] vendor/nimbus-build-system/vendor/Nim/lib/system/excpt.nim(631) signalHandler
# [2024-01-12T11:54:34.870Z] SIGSEGV: Illegal storage access. (Attempt to read from nil?)
#
# This appeared again around 25 Feb 2024, in a CI run for PR #5959,
# despite the extra `let` having been applied -- once more observed on
# macOS (aarch64) in Jenkins, and much rarer than before.
#
# [2024-02-25T23:21:24.533Z] Wrote test_keymanager_api/bootstrap_node.enr
# [2024-02-25T23:21:32.756Z] Serialization/deserialization [Beacon Node] [Preset: mainnet] . (0.00s)
# [2024-02-25T23:21:32.756Z] ListKeys requests [Beacon Node] [Preset: mainnet] .... (0.01s)
# [2024-02-25T23:21:37.219Z] ImportKeystores requests [Beacon Node] [Preset: mainnet] Traceback (most recent call last, using override)
# [2024-02-25T23:21:37.219Z] vendor/nim-libp2p/libp2p/protocols/pubsub/pubsub.nim(1068) main
# [2024-02-25T23:21:37.219Z] vendor/nim-libp2p/libp2p/protocols/pubsub/pubsub.nim(1058) NimMain
# [2024-02-25T23:21:37.219Z] vendor/nim-libp2p/libp2p/protocols/pubsub/pubsub.nim(1049) PreMain
# [2024-02-25T23:21:37.219Z] tests/test_keymanager_api.nim(1501) atmtest_keymanager_apidotnim_Init000
# [2024-02-25T23:21:37.219Z] tests/test_keymanager_api.nim(1474) main
# [2024-02-25T23:21:37.219Z] vendor/nim-chronos/chronos/internal/asyncfutures.nim(382) futureContinue
# [2024-02-25T23:21:37.219Z] tests/test_keymanager_api.nim(1480) main
# [2024-02-25T23:21:37.219Z] tests/test_keymanager_api.nim(307) startBeaconNode
# [2024-02-25T23:21:37.219Z] beacon_chain/nimbus_beacon_node.nim(1916) start
# [2024-02-25T23:21:37.219Z] beacon_chain/nimbus_beacon_node.nim(1863) run
# [2024-02-25T23:21:37.219Z] vendor/nim-chronos/chronos/internal/asyncengine.nim(150) poll
# [2024-02-25T23:21:37.219Z] vendor/nim-chronos/chronos/internal/asyncfutures.nim(382) futureContinue
# [2024-02-25T23:21:37.219Z] tests/test_keymanager_api.nim(1464) delayedTests
# [2024-02-25T23:21:37.219Z] tests/test_keymanager_api.nim(391) runTests
# [2024-02-25T23:21:37.219Z] vendor/nim-chronos/chronos/internal/asyncfutures.nim(382) futureContinue
# [2024-02-25T23:21:37.219Z] vendor/nim-unittest2/unittest2.nim(1151) runTests
# [2024-02-25T23:21:37.219Z] vendor/nim-unittest2/unittest2.nim(1086) runDirect
# [2024-02-25T23:21:37.219Z] vendor/nim-testutils/testutils/unittests.nim(16) runTestX60gensym3188
# [2024-02-25T23:21:37.219Z] vendor/nim-chronos/chronos/internal/asyncfutures.nim(660) waitFor
# [2024-02-25T23:21:37.219Z] vendor/nim-chronos/chronos/internal/asyncfutures.nim(635) pollFor
# [2024-02-25T23:21:37.219Z] vendor/nim-chronos/chronos/internal/asyncengine.nim(150) poll
# [2024-02-25T23:21:37.219Z] vendor/nim-chronos/chronos/internal/asyncfutures.nim(382) futureContinue
# [2024-02-25T23:21:37.219Z] vendor/nim-chronicles/chronicles.nim(251) proposeBlockAux
# [2024-02-25T23:21:37.219Z] SIGSEGV: Illegal storage access. (Attempt to read from nil?)
#
# One theory is that PR #5946 may increase the frequency, as there were
# times where the Jenkins CI failed almost every time using a shorter trace.
# However, the problem was once more flaky, with some passes in-between.
# For now, PR #5946 was reverted (low priority), and the problem is gone,
# whether related or not.
#
# [2024-02-23T23:11:47.700Z] Wrote test_keymanager_api/bootstrap_node.enr
# [2024-02-23T23:11:54.728Z] Serialization/deserialization [Beacon Node] [Preset: mainnet] . (0.00s)
# [2024-02-23T23:11:54.728Z] ListKeys requests [Beacon Node] [Preset: mainnet] .... (0.01s)
# [2024-02-23T23:11:59.523Z] ImportKeystores requests [Beacon Node] [Preset: mainnet] Traceback (most recent call last, using override)
# [2024-02-23T23:11:59.523Z] vendor/nim-libp2p/libp2p/protocols/pubsub/pubsub.nim(1067) main
# [2024-02-23T23:11:59.523Z] vendor/nim-libp2p/libp2p/protocols/pubsub/pubsub.nim(1057) NimMain
# [2024-02-23T23:11:59.523Z] vendor/nim-chronos/chronos/internal/asyncengine.nim(150) poll
# [2024-02-23T23:11:59.523Z] vendor/nim-chronos/chronos/internal/asyncengine.nim(150) poll
# [2024-02-23T23:11:59.523Z] SIGSEGV: Illegal storage access. (Attempt to read from nil?)
#
# The generated `nimcache` differs slightly if the `let` are separated from
# a single block; separation introduces an additional state in closure iter.
# This change, maybe combined with some macOS specific compiler specifics,
# could this trigger the `SIGSEGV`? Maybe the extra state adds just enough
# complexity to the function to disable certain problematic optimizations?
# The change in size of the environment changes a number of things such as
# alignment and which parts of an environment contain pointers and so on,
# which in turn may have surprising behavioural effects, ie most likely this
# extra state masks some underlying issue. Furthermore, the combination of
# `(await xyz).valueOr: return` is not very commonly used with other `await`
# in the same `let` block, which could explain this not being more common.
#
# Note that when compiling for Wasm, there are similar bugs with `results`
# when inlining unwraps, e.g., in `eth2_rest_serialization.nim`.
# These have not been investigated thoroughly so far as that project uses
# Nim 2.0 with --mm:orc and is just a prototype for Wasm, no production use.
# But maybe there is something weird going on with `results` related to the
# random `SIGSEGV` that we are now observing here, related to doing too much
# inline logic without defining intermediate isolated `let` statements.
#
# if mediaType == ApplicationJsonMediaType:
# try:
# - ok RestJson.decode(value, T,
# - requireAllFields = true,
# - allowUnknownFields = true)
# + let r = RestJson.decode(value, T,
# + requireAllFields = true,
# + allowUnknownFields = true)
# + ok r
# except SerializationError as exc:
# debug "Failed to deserialize REST JSON data",
# err = exc.formatMsg("<data>"),
#
# At this time we can only speculate about the trigger of these issues.
# Until a shared pattern can be identified, it is better to apply
# workarounds that at least avoid the known to be reachable triggers.
# The solution is hacky and far from desirable; it is what it is.
let
newBlockRef = (
await node.router.routeSignedBeaconBlock(signedBlock, blobsOpt)
).valueOr:
Expand Down

0 comments on commit 605bf99

Please sign in to comment.