# Debug unstable system tests (#6486)
## Motivation

This PR tries to make the unstable system tests more reliable: it defaults test logging to debug verbosity, caps cluster sizes at 20 nodes, and replaces sleep-based polling in `sendTransactions` with a deadline derived from the cluster's layer duration.
fasmat committed Nov 22, 2024
commit 2a00078 (1 parent: 30880eb)
Showing 5 changed files with 60 additions and 58 deletions.
systest/testcontext/context.go (8 changes: 5 additions & 3 deletions)
@@ -49,7 +49,7 @@ var (
 		1,
 		"controls tests parallelization by creating multiple spacemesh clusters at the same time",
 	)
-	logLevel    = zap.LevelFlag("level", zap.InfoLevel, "verbosity of the logger")
+	logLevel    = zap.LevelFlag("level", zap.DebugLevel, "verbosity of the logger")
 	testTimeout = flag.Duration("test-timeout", 60*time.Minute, "timeout for a single test")
 
 	tokens chan struct{}
@@ -250,7 +250,8 @@ func updateContext(ctx *Context) error {
 	keep, err := strconv.ParseBool(keepval)
 	if err != nil {
 		ctx.Log.Panicw("invalid state. keep label should be parsable as a boolean",
-			"keepval", keepval)
+			"keepval", keepval,
+		)
 	}
 	ctx.Keep = ctx.Keep || keep
 
@@ -261,7 +262,8 @@ func updateContext(ctx *Context) error {
 	psize, err := strconv.Atoi(psizeval)
 	if err != nil {
 		ctx.Log.Panicw("invalid state. poet size label should be parsable as an integer",
-			"psizeval", psizeval)
+			"psizeval", psizeval,
+		)
 	}
 	ctx.PoetSize = psize
 	return nil
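The substantive change in this file is the default verbosity: unless `-level` is passed, system tests now log at debug level. As a reference for how `zap.LevelFlag` plugs into the standard `flag` package, here is a minimal, self-contained sketch; the flag name and default mirror the diff, while the logger construction is illustrative rather than the harness's actual setup:

```go
package main

import (
	"flag"

	"go.uber.org/zap"
)

func main() {
	// Mirrors the diff: DebugLevel is now the default, so running the
	// tests without -level produces full debug output.
	logLevel := zap.LevelFlag("level", zap.DebugLevel, "verbosity of the logger")
	flag.Parse()

	cfg := zap.NewDevelopmentConfig()
	cfg.Level = zap.NewAtomicLevelAt(*logLevel) // wire the parsed level into the config
	logger, err := cfg.Build()
	if err != nil {
		panic(err)
	}
	defer logger.Sync()

	logger.Debug("visible by default after this change") // hidden again with -level=info
}
```

Passing `-level=info` on the command line restores the previous verbosity.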
systest/tests/checkpoint_test.go (4 changes: 2 additions & 2 deletions)
@@ -39,7 +39,7 @@ func TestCheckpoint(t *testing.T) {
 
 	tctx := testcontext.New(t)
 	addedLater := 2
-	size := min(tctx.ClusterSize, 30)
+	size := min(tctx.ClusterSize, 20)
 	oldSize := size - addedLater
 	if tctx.ClusterSize > oldSize {
 		tctx.Log.Info("cluster size changed to ", oldSize)
@@ -68,7 +68,7 @@ func TestCheckpoint(t *testing.T) {
 	stop := first + 2
 	receiver := types.GenerateAddress([]byte{11, 1, 1})
 	tctx.Log.Infow("sending transactions", "from", first, "to", stop-1)
-	require.NoError(t, sendTransactions(ctx, eg, tctx.Log, cl, first, stop, receiver, 1, 100))
+	require.NoError(t, sendTransactions(ctx, eg, tctx.Log.Desugar(), cl, first, stop, layerDuration, receiver, 1, 100))
 	require.NoError(t, eg.Wait())
 
 	require.NoError(t, waitLayer(tctx, cl.Client(0), snapshotLayer))
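Note the `tctx.Log.Desugar()` at the call site: as shown in `common.go` below, `sendTransactions` now takes a strongly typed `*zap.Logger` instead of a `*zap.SugaredLogger`. The two flavors convert freely in both directions; a minimal sketch of the distinction (messages and fields are illustrative):

```go
package main

import "go.uber.org/zap"

func main() {
	logger, _ := zap.NewDevelopment() // *zap.Logger: strongly typed fields
	sugar := logger.Sugar()           // *zap.SugaredLogger: loose key/value pairs

	// Sugared API, as the old sendTransactions used:
	sugar.Infow("sending transactions", "from", 8, "to", 10)

	// Desugar returns the underlying *zap.Logger, as the new call sites do:
	sugar.Desugar().Info("sending transactions",
		zap.Uint32("from", 8),
		zap.Uint32("to", 10),
	)
}
```

The typed API avoids reflection on every call, and malformed key/value pairs become compile-time rather than runtime problems.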
systest/tests/common.go (63 changes: 28 additions & 35 deletions)
@@ -33,64 +33,62 @@ var retryBackoff = 10 * time.Second
 func sendTransactions(
 	ctx context.Context,
 	eg *errgroup.Group,
-	logger *zap.SugaredLogger,
+	logger *zap.Logger,
 	cl *cluster.Cluster,
 	first, stop uint32,
+	layerDuration time.Duration,
 	receiver types.Address,
 	batch, amount int,
 ) error {
 	for i := range cl.Accounts() {
 		client := cl.Client(i % cl.Total())
 		nonce, err := getNonce(ctx, client, cl.Address(i))
 		if err != nil {
-			return fmt.Errorf("get nonce failed (%s:%s): %w", client.Name, cl.Address(i), err)
+			return fmt.Errorf("get nonce failed (%s: %s): %w", client.Name, cl.Address(i), err)
 		}
+		deadline := cl.Genesis().Add(time.Duration(stop+1) * layerDuration)
+		minimum := time.Now().Add(2 * layerDuration)
+		if deadline.Before(minimum) { // make sure we have at least two layers to submit transactions
+			deadline = minimum
+		}
-		watchLayers(ctx, eg, client, logger.Desugar(), func(layer *pb.LayerStreamResponse) (bool, error) {
+		ctx, cancel := context.WithDeadline(ctx, deadline)
+		defer cancel()
+		watchLayers(ctx, eg, client, logger, func(layer *pb.LayerStreamResponse) (bool, error) {
 			if layer.Layer.Number.Number == stop {
 				return false, nil
 			}
-			if layer.Layer.Status != pb.Layer_LAYER_STATUS_APPROVED ||
-				layer.Layer.Number.Number < first {
+			if layer.Layer.Status != pb.Layer_LAYER_STATUS_APPLIED || layer.Layer.Number.Number < first {
 				return true, nil
 			}
-			// give some time for a previous layer to be applied
-			// TODO(dshulyak) introduce api that simply subscribes to internal clock
-			// and outputs events when the tick for the layer is available
-			time.Sleep(200 * time.Millisecond)
 			if nonce == 0 {
-				logger.Infow("address needs to be spawned", "account", i)
+				logger.Info("address needs to be spawned", zap.Stringer("address", cl.Address(i)))
 				if err := submitSpawn(ctx, cl, i, client); err != nil {
 					return false, fmt.Errorf("failed to spawn %w", err)
 				}
 				nonce++
 				return true, nil
 			}
-			logger.Debugw("submitting transactions",
-				"layer", layer.Layer.Number.Number,
-				"client", client.Name,
-				"account", i,
-				"nonce", nonce,
-				"batch", batch,
+			logger.Debug("submitting transactions",
+				zap.Uint32("layer", layer.Layer.Number.Number),
+				zap.String("client", client.Name),
+				zap.Stringer("address", cl.Address(i)),
+				zap.Uint64("nonce", nonce),
+				zap.Int("batch", batch),
 			)
-			for j := 0; j < batch; j++ {
+			for j := range batch {
 				// in case spawn isn't executed on this particular client
 				retries := 3
 				spendClient := client
-				for k := 0; k < retries; k++ {
+				for k := range retries {
 					err = submitSpend(ctx, cl, i, receiver, uint64(amount), nonce+uint64(j), spendClient)
 					if err == nil {
 						break
 					}
-					logger.Warnw(
-						"failed to spend",
-						"client",
-						spendClient.Name,
-						"account",
-						i,
-						"nonce",
-						nonce+uint64(j),
-						"err",
-						err.Error(),
+					logger.Warn("failed to spend",
+						zap.String("client", spendClient.Name),
+						zap.Stringer("address", cl.Address(i)),
+						zap.Uint64("nonce", nonce+uint64(j)),
+						zap.Error(err),
					)
 					spendClient = cl.Client((i + k + 1) % cl.Total())
 				}
@@ -522,13 +520,8 @@ func submitSpend(
 ) error {
 	ctx, cancel := context.WithTimeout(ctx, 5*time.Second)
 	defer cancel()
-	_, err := submitTransaction(ctx,
-		wallet.Spend(
-			cluster.Private(account), receiver, amount,
-			nonce,
-			sdk.WithGenesisID(cluster.GenesisID()),
-		),
-		client)
+	tx := wallet.Spend(cluster.Private(account), receiver, amount, nonce, sdk.WithGenesisID(cluster.GenesisID()))
+	_, err := submitTransaction(ctx, tx, client)
 	return err
 }
 
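Two details in this hunk deserve a note. `for j := range batch` is Go 1.22's range-over-int form of `for j := 0; j < batch; j++`. More importantly, the 200 ms polling sleep is gone: the layer watcher now runs under a context deadline derived from the cluster's layer time, so a stalled stream fails fast instead of hanging the test. A standalone sketch of that deadline logic, where `genesis`, `stop`, and `layerDuration` stand in for `cl.Genesis()`, the stop layer, and the cluster parameter:

```go
package main

import (
	"context"
	"fmt"
	"time"
)

// deadlineFor mirrors the logic added to sendTransactions: watch until one
// layer past stop, but never less than two layer durations from now, so
// there is always room to submit at least a couple of transactions.
func deadlineFor(genesis time.Time, stop uint32, layerDuration time.Duration) time.Time {
	deadline := genesis.Add(time.Duration(stop+1) * layerDuration)
	if minimum := time.Now().Add(2 * layerDuration); deadline.Before(minimum) {
		return minimum
	}
	return deadline
}

func main() {
	// Hypothetical values: genesis five minutes ago, stop at layer 12, 30s layers.
	genesis := time.Now().Add(-5 * time.Minute)
	deadline := deadlineFor(genesis, 12, 30*time.Second)

	// The watcher context now expires on its own instead of relying on sleeps.
	ctx, cancel := context.WithDeadline(context.Background(), deadline)
	defer cancel()

	dl, _ := ctx.Deadline()
	fmt.Println("watching layers until", dl)
}
```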
systest/tests/partition_test.go (27 changes: 17 additions & 10 deletions)
@@ -62,7 +62,11 @@ func testPartition(t *testing.T, tctx *testcontext.Context, cl *cluster.Cluster,
 	tctx.Log.Debug("sending transactions...")
 	eg2, ctx2 := errgroup.WithContext(tctx)
 	receiver := types.GenerateAddress([]byte{11, 1, 1})
-	require.NoError(t, sendTransactions(ctx2, eg2, tctx.Log, cl, first, stop, receiver, 10, 100))
+
+	layerDuration := testcontext.LayerDuration.Get(tctx.Parameters)
+	require.NoError(t,
+		sendTransactions(ctx2, eg2, tctx.Log.Desugar(), cl, first, stop, layerDuration, receiver, 10, 100),
+	)
 
 	type stateUpdate struct {
 		layer uint32
@@ -92,7 +96,8 @@ func testPartition(t *testing.T, tctx *testcontext.Context, cl *cluster.Cluster,
 			tctx.Log.Debugw("state hash collected",
 				"client", node.Name,
 				"layer", layer,
-				"state", stateHash.ShortString())
+				"state", stateHash.ShortString(),
+			)
 			stateCh <- &stateUpdate{
 				layer: layer,
 				hash:  stateHash,
@@ -154,15 +159,17 @@ func testPartition(t *testing.T, tctx *testcontext.Context, cl *cluster.Cluster,
 					"ref_client", cl.Client(0).Name,
 					"layer", layer,
 					"client_hash", clientState[layer],
-					"ref_hash", refState[layer])
+					"ref_hash", refState[layer],
+				)
 				agree = false
 				break
 			}
 		}
 		if agree {
 			tctx.Log.Debugw("client agreed with ref client on all layers",
 				"client", cl.Client(i).Name,
-				"ref_client", cl.Client(0).Name)
+				"ref_client", cl.Client(0).Name,
+			)
 		}
 		pass = pass && agree
 	}
@@ -176,9 +183,9 @@ func TestPartition_30_70(t *testing.T) {
 	t.Parallel()
 
 	tctx := testcontext.New(t)
-	if tctx.ClusterSize > 30 {
-		tctx.Log.Info("cluster size changed to 30")
-		tctx.ClusterSize = 30
+	if tctx.ClusterSize > 20 {
+		tctx.Log.Info("cluster size changed to 20")
+		tctx.ClusterSize = 20
 	}
 	cl, err := cluster.ReuseWait(tctx, cluster.WithKeys(tctx.ClusterSize))
 	require.NoError(t, err)
@@ -191,9 +198,9 @@ func TestPartition_50_50(t *testing.T) {
 	t.Parallel()
 
 	tctx := testcontext.New(t)
-	if tctx.ClusterSize > 30 {
-		tctx.Log.Info("cluster size changed to 30")
-		tctx.ClusterSize = 30
+	if tctx.ClusterSize > 20 {
+		tctx.Log.Info("cluster size changed to 20")
+		tctx.ClusterSize = 20
 	}
 	cl, err := cluster.ReuseWait(tctx, cluster.WithKeys(tctx.ClusterSize))
 	require.NoError(t, err)
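These tests fan the senders and watchers out through `errgroup.WithContext` (the `eg2, ctx2 := errgroup.WithContext(tctx)` above), which is what makes the new deadline effective: the derived context is canceled as soon as any goroutine in the group returns an error. A minimal sketch of the pattern, with an illustrative worker body:

```go
package main

import (
	"context"
	"errors"
	"fmt"

	"golang.org/x/sync/errgroup"
)

func main() {
	eg, ctx := errgroup.WithContext(context.Background())
	for i := range 3 {
		eg.Go(func() error {
			if i == 1 {
				return errors.New("worker 1 failed") // cancels ctx for the others
			}
			select {
			case <-ctx.Done(): // a sibling failed or the deadline passed; stop early
				return ctx.Err()
			default:
				fmt.Println("worker", i, "finished")
				return nil
			}
		})
	}
	// Wait returns the first error from any worker, which is what
	// require.NoError(t, eg.Wait()) asserts in the tests.
	if err := eg.Wait(); err != nil {
		fmt.Println("group failed:", err)
	}
}
```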
systest/tests/transactions_test.go (16 changes: 8 additions & 8 deletions)
@@ -21,18 +21,18 @@ func testTransactions(
 ) {
 	var (
 		// start sending transactions after two layers or after genesis
-		first       = max(currentLayer(tctx, tb, cl.Client(0))+2, 8)
-		stopSending = first + sendFor
-		batch       = 10
-		amount      = 100
+		first  = max(currentLayer(tctx, tb, cl.Client(0))+2, 8)
+		stop   = first + sendFor
+		batch  = 10
+		amount = 100
 
 		// each account creates spawn transaction in the first layer
 		// plus batch number of spend transactions in every layer after that
 		expectedCount = cl.Accounts() * (1 + int(sendFor-1)*batch)
 	)
 	tctx.Log.Debugw("running transactions test",
 		"from", first,
-		"stop sending", stopSending,
+		"stop sending", stop,
 		"expected transactions", expectedCount,
 	)
 	receiver := types.GenerateAddress([]byte{11, 1, 1})
@@ -45,9 +45,9 @@ func testTransactions(
 	before := response.AccountWrapper.StateCurrent.Balance
 
 	eg, ctx := errgroup.WithContext(tctx)
-	require.NoError(
-		tb,
-		sendTransactions(ctx, eg, tctx.Log, cl, first, stopSending, receiver, batch, amount),
+	layerDuration := testcontext.LayerDuration.Get(tctx.Parameters)
+	require.NoError(tb,
+		sendTransactions(ctx, eg, tctx.Log.Desugar(), cl, first, stop, layerDuration, receiver, batch, amount),
 	)
 	txs := make([][]*pb.Transaction, cl.Total())
 
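The rename from `stopSending` to `stop` matches the parameter name in `sendTransactions`. The `expectedCount` formula is worth a worked example: each account submits one spawn in the first layer, then `batch` spends in each of the remaining `sendFor-1` layers. With hypothetical numbers:

```go
package main

import "fmt"

func main() {
	// Hypothetical: 10 funded accounts, sending for 4 layers, 10 spends per layer.
	accounts, sendFor, batch := 10, 4, 10

	// One spawn in the first layer plus batch spends in each of the
	// remaining sendFor-1 layers, per account (formula from testTransactions).
	expectedCount := accounts * (1 + (sendFor-1)*batch)
	fmt.Println(expectedCount) // 10 * (1 + 3*10) = 310
}
```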
