Skip to content

Commit

Permalink
Improve stability of system tests (#6486)
Browse files Browse the repository at this point in the history
## Motivation

This PR tries to make the unstable system tests more reliable.
  • Loading branch information
fasmat committed Nov 27, 2024
1 parent 1081a13 commit 514f296
Show file tree
Hide file tree
Showing 14 changed files with 236 additions and 186 deletions.
2 changes: 1 addition & 1 deletion Makefile-libs.Inc
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ else
endif
endif

POSTRS_SETUP_REV = 0.8.1
POSTRS_SETUP_REV = 0.8.4
POSTRS_SETUP_ZIP = libpost-$(platform)-v$(POSTRS_SETUP_REV).zip
POSTRS_SETUP_URL_ZIP ?= https://github.com/spacemeshos/post-rs/releases/download/v$(POSTRS_SETUP_REV)/$(POSTRS_SETUP_ZIP)

Expand Down
22 changes: 21 additions & 1 deletion fetch/mesh_data.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (

"github.com/spacemeshos/go-scale"
"go.uber.org/zap"
"go.uber.org/zap/zapcore"
"golang.org/x/sync/errgroup"

"github.com/spacemeshos/go-spacemesh/codec"
Expand Down Expand Up @@ -177,20 +178,39 @@ func (f *Fetch) GetBlocks(ctx context.Context, ids []types.BlockID) error {

// GetProposalTxs requests the transactions identified by ids, passing the
// proposal transaction handler (f.validators.txProposal.HandleMessage) to
// getTxs as the receiver for the fetched data.
//
// Before delegating, the request is logged at debug level together with the
// number of IDs and the short string form of every ID. The error returned by
// getTxs is passed back to the caller unchanged.
func (f *Fetch) GetProposalTxs(ctx context.Context, ids []types.TransactionID) error {
	// zap only invokes the marshaler when the entry is actually encoded, so the
	// short IDs are not rendered if the debug entry is dropped.
	shortIDs := zapcore.ArrayMarshalerFunc(func(enc zapcore.ArrayEncoder) error {
		for i := range ids {
			enc.AppendString(ids[i].ShortString())
		}
		return nil
	})
	f.logger.Debug("requesting proposal txs from peer",
		log.ZContext(ctx),
		zap.Int("num_txs", len(ids)),
		zap.Array("txs", shortIDs),
	)

	return f.getTxs(ctx, ids, f.validators.txProposal.HandleMessage)
}

// GetBlockTxs requests the transactions identified by ids, passing the block
// transaction handler (f.validators.txBlock.HandleMessage) to getTxs as the
// receiver for the fetched data. Per the original note on this function, the
// transactions are saved here and validated later, before the block is applied.
//
// Before delegating, the request is logged at debug level together with the
// number of IDs and the short string form of every ID. The error returned by
// getTxs is passed back to the caller unchanged.
func (f *Fetch) GetBlockTxs(ctx context.Context, ids []types.TransactionID) error {
	// zap only invokes the marshaler when the entry is actually encoded, so the
	// short IDs are not rendered if the debug entry is dropped.
	shortIDs := zapcore.ArrayMarshalerFunc(func(enc zapcore.ArrayEncoder) error {
		for i := range ids {
			enc.AppendString(ids[i].ShortString())
		}
		return nil
	})
	f.logger.Debug("requesting block txs from peer",
		log.ZContext(ctx),
		zap.Int("num_txs", len(ids)),
		zap.Array("txs", shortIDs),
	)

	return f.getTxs(ctx, ids, f.validators.txBlock.HandleMessage)
}

// getTxs is the shared implementation behind GetProposalTxs and GetBlockTxs.
//
// An empty ids slice is a no-op that returns nil. Otherwise the request is
// logged at debug level, the transaction IDs are converted to hashes and the
// fetch is delegated to getHashes with the datastore.TXDB hint and the given
// receiver; the error from getHashes is returned as is.
func (f *Fetch) getTxs(ctx context.Context, ids []types.TransactionID, receiver dataReceiver) error {
	count := len(ids)
	if count == 0 {
		// Nothing to request.
		return nil
	}

	f.logger.Debug("requesting txs from peer", log.ZContext(ctx), zap.Int("num_txs", count))
	return f.getHashes(ctx, types.TransactionIDsToHashes(ids), datastore.TXDB, receiver)
}
Expand Down
4 changes: 3 additions & 1 deletion miner/proposal_builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -483,7 +483,9 @@ func (pb *ProposalBuilder) initSharedData(ctx context.Context, current types.Lay
//
// Additionally all activesets that are older than 2 epochs are deleted at the beginning of an epoch anyway, but
// maybe we should revisit this when activesets are no longer bootstrapped.
return pb.db.WithTx(ctx, func(tx sql.Transaction) error {
//
// TODO(mafa): I'm still seeing SQL_BUSY errors in the logs, so for now this is changed back to WithTxImmediate.
return pb.db.WithTxImmediate(ctx, func(tx sql.Transaction) error {
yes, err := activesets.Has(tx, pb.shared.active.id)
if err != nil {
return err
Expand Down
17 changes: 10 additions & 7 deletions systest/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,20 @@ tmpfile := $(shell mktemp /tmp/systest-XXX)
test_name ?= TestSmeshing
org ?= spacemeshos
image_name ?= $(org)/systest:$(version_info)
certifier_image ?= $(org)/certifier-service:v0.7.13
certifier_image ?= $(org)/certifier-service:v0.8.4
poet_image ?= $(org)/poet:v0.10.10
post_service_image ?= $(org)/post-service:v0.7.13
post_init_image ?= $(org)/postcli:v0.12.5
post_service_image ?= $(org)/post-service:v0.8.4
post_init_image ?= $(org)/postcli:v0.12.10
smesher_image ?= $(org)/go-spacemesh-dev:$(version_info)
old_smesher_image ?= $(org)/go-spacemesh-dev:7b9337a # Update this when new version is released
old_smesher_image ?= $(org)/go-spacemesh-dev:v1.7.7
bs_image ?= $(org)/go-spacemesh-dev-bs:$(version_info)

test_id ?= systest-$(version_info)
test_job_name ?= systest-$(version_info)-$(date)
keep ?= false
clusters ?= 1
size ?= 10
poet_size ?= 3
poet_size ?= 2
level ?= debug
bootstrap ?= 5m
storage ?= standard=1Gi
Expand All @@ -38,8 +38,11 @@ ifeq ($(configname),$(test_job_name))
run_deps = config
endif

command := gotestsum --raw-command -- test2json -t -p systest \
/bin/tests -test.v -test.count=$(count) -test.timeout=60m -test.run=$(test_name) -test.parallel=$(clusters) \
# command := gotestsum --raw-command -- test2json -t -p systest \
/bin/tests -test.v -test.count=$(count) -test.timeout=60m -test.run=$(test_name) -test.parallel=$(clusters) \
-test.failfast=$(failfast) -clusters=$(clusters) -level=$(level) -configname=$(configname)

command := /bin/tests -test.v -test.count=$(count) -test.timeout=60m -test.run=$(test_name) -test.parallel=$(clusters) \
-test.failfast=$(failfast) -clusters=$(clusters) -level=$(level) -configname=$(configname)

.PHONY: docker
Expand Down
11 changes: 8 additions & 3 deletions systest/cluster/nodes.go
Original file line number Diff line number Diff line change
Expand Up @@ -878,8 +878,7 @@ func deployNode(
corev1.Volume().WithName("config").
WithConfigMap(corev1.ConfigMapVolumeSource().WithName(spacemeshConfigMapName)),
corev1.Volume().WithName("data").
WithEmptyDir(corev1.EmptyDirVolumeSource().
WithSizeLimit(resource.MustParse(ctx.Storage.Size))),
WithEmptyDir(corev1.EmptyDirVolumeSource().WithSizeLimit(resource.MustParse(ctx.Storage.Size))),
).
WithDNSConfig(corev1.PodDNSConfig().WithOptions(
corev1.PodDNSConfigOption().WithName("timeout").WithValue("1"),
Expand Down Expand Up @@ -911,7 +910,13 @@ func deployNode(
).WithInitialDelaySeconds(10).WithPeriodSeconds(10),
).
WithEnv(
corev1.EnvVar().WithName("GOMAXPROCS").WithValue("4"),
corev1.EnvVar().WithName("GOMAXPROCS").WithValueFrom(
corev1.EnvVarSource().WithResourceFieldRef(
corev1.ResourceFieldSelector().
WithResource("limits.cpu").
WithDivisor(resource.MustParse("1")),
),
),
).
WithCommand(cmd...),
)
Expand Down
8 changes: 5 additions & 3 deletions systest/testcontext/context.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ var (
10,
)
poetSize = parameters.Int(
"poet-size", "size of the poet servers", 1,
"poet-size", "size of the poet servers", 2,
)
bsSize = parameters.Int(
"bs-size", "size of bootstrappers", 1,
Expand Down Expand Up @@ -250,7 +250,8 @@ func updateContext(ctx *Context) error {
keep, err := strconv.ParseBool(keepval)
if err != nil {
ctx.Log.Panicw("invalid state. keep label should be parsable as a boolean",
"keepval", keepval)
"keepval", keepval,
)
}
ctx.Keep = ctx.Keep || keep

Expand All @@ -261,7 +262,8 @@ func updateContext(ctx *Context) error {
psize, err := strconv.Atoi(psizeval)
if err != nil {
ctx.Log.Panicw("invalid state. poet size label should be parsable as an integer",
"psizeval", psizeval)
"psizeval", psizeval,
)
}
ctx.PoetSize = psize
return nil
Expand Down
23 changes: 11 additions & 12 deletions systest/tests/checkpoint_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,9 @@ func TestCheckpoint(t *testing.T) {

tctx := testcontext.New(t)
addedLater := 2
size := min(tctx.ClusterSize, 30)
oldSize := size - addedLater
if tctx.ClusterSize > oldSize {
tctx.Log.Info("cluster size changed to ", oldSize)
tctx.ClusterSize = oldSize
}
oldSize := tctx.ClusterSize - addedLater
tctx.Log.Info("cluster size changed to ", oldSize)
tctx.ClusterSize = oldSize

// at the last layer of epoch 3, in the beginning of poet round 2.
// it is important to avoid check-pointing in the middle of cycle gap
Expand All @@ -63,14 +60,15 @@ func TestCheckpoint(t *testing.T) {
require.EqualValues(t, 4, layersPerEpoch, "checkpoint layer require tuning as layersPerEpoch is changed")
layerDuration := testcontext.LayerDuration.Get(tctx.Parameters)

eg, ctx := errgroup.WithContext(tctx)
first := layersPerEpoch * 2
stop := first + 2
receiver := types.GenerateAddress([]byte{11, 1, 1})
tctx.Log.Infow("sending transactions", "from", first, "to", stop-1)
require.NoError(t, sendTransactions(ctx, eg, tctx.Log, cl, first, stop, receiver, 1, 100))
require.NoError(t, eg.Wait())

deadline := cl.Genesis().Add(time.Duration(stop+1) * layerDuration)
ctx, cancel := context.WithDeadline(tctx, deadline)
defer cancel()
require.NoError(t, sendTransactions(ctx, tctx.Log.Desugar(), cl, first, stop, receiver, 1, 100))
require.NoError(t, waitLayer(tctx, cl.Client(0), snapshotLayer))

tctx.Log.Debugw("getting account balances")
Expand Down Expand Up @@ -100,7 +98,8 @@ func TestCheckpoint(t *testing.T) {
diffs = append(diffs, cl.Client(i).Name)
tctx.Log.Errorw("diff checkpoint data",
fmt.Sprintf("reference %v", cl.Client(0).Name), string(checkpoints[0]),
fmt.Sprintf("client %v", cl.Client(i).Name), string(checkpoints[i]))
fmt.Sprintf("client %v", cl.Client(i).Name), string(checkpoints[i]),
)
}
}
require.Empty(t, diffs)
Expand Down Expand Up @@ -173,8 +172,8 @@ func TestCheckpoint(t *testing.T) {
ensureSmeshing(t, tctx, cl, checkpointEpoch)

// increase the cluster size to the original test size
tctx.Log.Info("cluster size changed to ", size)
tctx.ClusterSize = size
tctx.ClusterSize += addedLater
tctx.Log.Info("cluster size changed to ", tctx.ClusterSize)
require.NoError(t, cl.AddSmeshers(tctx, addedLater))

tctx.Log.Infow("waiting for all miners to be smeshing", "last epoch", lastEpoch)
Expand Down
Loading

0 comments on commit 514f296

Please sign in to comment.