Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Node metrics #948

Merged
merged 35 commits into master from node-metrics
Dec 16, 2024
Merged
Show file tree
Hide file tree
Changes from 30 commits
Commits
Show all changes
35 commits
Select commit. Hold shift + click to select a range.
362a390
Add metrics to relay.
cody-littley Nov 26, 2024
3b4637e
Incremental progress.
cody-littley Nov 27, 2024
60f015e
Incremental progress.
cody-littley Nov 27, 2024
2d7e9ef
Incremental progress, need running averages.
cody-littley Nov 27, 2024
b8c7d35
Added running average metrics for GetChunks
cody-littley Nov 27, 2024
a6692c4
Merge branch 'master' into relay-metrics
cody-littley Nov 27, 2024
b9d71d6
Documentation
cody-littley Nov 27, 2024
671f0c8
Add time window to metrics doc
cody-littley Nov 27, 2024
4adb7ea
Added GetBlob metrics.
cody-littley Nov 27, 2024
5579a88
Cleanup.
cody-littley Nov 27, 2024
2b84f21
Cleanup test
cody-littley Nov 27, 2024
fb0cad5
Add locking for running average metric.
cody-littley Nov 27, 2024
a2c05cb
Merge branch 'master' into relay-metrics
cody-littley Nov 27, 2024
dfd2925
Add cache metrics.
cody-littley Nov 27, 2024
24f5f5d
Fix test bug
cody-littley Nov 27, 2024
c3adb70
Made suggested change.
cody-littley Dec 3, 2024
5c8c173
Added metrics for v2 DA node.
cody-littley Dec 3, 2024
1795654
Added metrics documentation.
cody-littley Dec 3, 2024
434c6b9
Merge branch 'master' into node-metrics
cody-littley Dec 6, 2024
5c9274c
Revert deletions.
cody-littley Dec 6, 2024
4d4bfe9
Remove documentation.
cody-littley Dec 6, 2024
d9d898c
Reimplement without metrics framework.
cody-littley Dec 6, 2024
8bd8ff1
Cleanup.
cody-littley Dec 6, 2024
2070eee
Stop background thread when metrics are stopped.
cody-littley Dec 6, 2024
cffa884
Revert unintentional change
cody-littley Dec 6, 2024
5c511c9
Made suggested changes.
cody-littley Dec 10, 2024
a15117f
Don't start two metrics servers.
cody-littley Dec 10, 2024
1076a8f
Fix compile issue.
cody-littley Dec 10, 2024
bbf9005
Merge branch 'master' into node-metrics
cody-littley Dec 11, 2024
62ec4f6
Merge branch 'master' into node-metrics
cody-littley Dec 11, 2024
143b798
Enable debug code.
cody-littley Dec 12, 2024
168ded5
Debug
cody-littley Dec 12, 2024
dd21f61
Fix inabox bug.
cody-littley Dec 12, 2024
1bb1404
Made suggested changes.
cody-littley Dec 12, 2024
864e7d0
Made suggested changes.
cody-littley Dec 13, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion node/cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,8 +85,14 @@ func NodeMain(ctx *cli.Context) error {
}

// Creates the GRPC server.

// TODO(cody-littley): the metrics server is currently started by eigenmetrics, which is in another repo.
// When we fully remove v1 support, we need to start the metrics server inside the v2 metrics code.
server := nodegrpc.NewServer(config, node, logger, ratelimiter)
serverV2 := nodegrpc.NewServerV2(config, node, logger, ratelimiter)
serverV2, err := nodegrpc.NewServerV2(config, node, logger, ratelimiter, reg)
if err != nil {
return fmt.Errorf("failed to create server v2: %v", err)
}
err = nodegrpc.RunServers(server, serverV2, config, logger)

return err
Expand Down
6 changes: 4 additions & 2 deletions node/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ type Config struct {
EnableNodeApi bool
NodeApiPort string
EnableMetrics bool
MetricsPort string
MetricsPort int
OnchainMetricsInterval int64
Timeout time.Duration
RegisterNodeAtStart bool
Expand All @@ -62,6 +62,7 @@ type Config struct {
OverrideStoreDurationBlocks int64
QuorumIDList []core.QuorumID
DbPath string
DBSizePollPeriod time.Duration
LogPath string
PrivateBls string
ID core.OperatorID
Expand Down Expand Up @@ -207,7 +208,7 @@ func NewConfig(ctx *cli.Context) (*Config, error) {
EnableNodeApi: ctx.GlobalBool(flags.EnableNodeApiFlag.Name),
NodeApiPort: ctx.GlobalString(flags.NodeApiPortFlag.Name),
EnableMetrics: ctx.GlobalBool(flags.EnableMetricsFlag.Name),
MetricsPort: ctx.GlobalString(flags.MetricsPortFlag.Name),
MetricsPort: ctx.GlobalInt(flags.MetricsPortFlag.Name),
cody-littley marked this conversation as resolved.
Show resolved Hide resolved
OnchainMetricsInterval: ctx.GlobalInt64(flags.OnchainMetricsIntervalFlag.Name),
Timeout: timeout,
RegisterNodeAtStart: registerNodeAtStart,
Expand All @@ -218,6 +219,7 @@ func NewConfig(ctx *cli.Context) (*Config, error) {
OverrideStoreDurationBlocks: ctx.GlobalInt64(flags.OverrideStoreDurationBlocksFlag.Name),
QuorumIDList: ids,
DbPath: ctx.GlobalString(flags.DbPathFlag.Name),
DBSizePollPeriod: ctx.GlobalDuration(flags.DBSizePollPeriodFlag.Name),
PrivateBls: privateBls,
EthClientConfig: ethClientConfig,
EncoderConfig: kzg.ReadCLIConfig(ctx),
Expand Down
12 changes: 10 additions & 2 deletions node/flags/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,11 +66,11 @@ var (
Required: true,
EnvVar: common.PrefixEnvVar(EnvVarPrefix, "ENABLE_METRICS"),
}
MetricsPortFlag = cli.StringFlag{
MetricsPortFlag = cli.IntFlag{
Name: common.PrefixFlag(FlagPrefix, "metrics-port"),
Usage: "Port at which node listens for metrics calls",
Required: false,
Value: "9091",
Value: 9091,
EnvVar: common.PrefixEnvVar(EnvVarPrefix, "METRICS_PORT"),
}
OnchainMetricsIntervalFlag = cli.StringFlag{
Expand Down Expand Up @@ -98,6 +98,13 @@ var (
Required: true,
EnvVar: common.PrefixEnvVar(EnvVarPrefix, "DB_PATH"),
}
DBSizePollPeriodFlag = cli.DurationFlag{
cody-littley marked this conversation as resolved.
Show resolved Hide resolved
Name: common.PrefixFlag(FlagPrefix, "db-size-poll-period"),
Usage: "The period at which the database size is polled. If set to 0, the database size is not polled.",
Required: false,
Value: 10 * time.Minute,
EnvVar: common.PrefixEnvVar(EnvVarPrefix, "DB_SIZE_POLL_PERIOD"),
}
// The files for encrypted private keys.
BlsKeyFileFlag = cli.StringFlag{
Name: common.PrefixFlag(FlagPrefix, "bls-key-file"),
Expand Down Expand Up @@ -384,6 +391,7 @@ var optionalFlags = []cli.Flag{
ChunkDownloadTimeoutFlag,
PprofHttpPort,
EnablePprof,
DBSizePollPeriodFlag,
}

func init() {
Expand Down
6 changes: 4 additions & 2 deletions node/grpc/run.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ func RunServers(serverV1 *Server, serverV2 *ServerV2, config *node.Config, logge
return errors.New("node V2 server is not configured")
}

serverV2.metrics.Start()

go func() {
for {
addr := fmt.Sprintf("%s:%s", localhost, config.InternalDispersalPort)
Expand All @@ -33,7 +35,7 @@ func RunServers(serverV1 *Server, serverV2 *ServerV2, config *node.Config, logge
}

opt := grpc.MaxRecvMsgSize(60 * 1024 * 1024 * 1024) // 60 GiB
gs := grpc.NewServer(opt)
gs := grpc.NewServer(opt, serverV2.metrics.GetGRPCServerOption())

// Register reflection service on gRPC server
// This makes "grpcurl -plaintext localhost:9000 list" command work
Expand All @@ -60,7 +62,7 @@ func RunServers(serverV1 *Server, serverV2 *ServerV2, config *node.Config, logge
}

opt := grpc.MaxRecvMsgSize(1024 * 1024 * 300) // 300 MiB
gs := grpc.NewServer(opt)
gs := grpc.NewServer(opt, serverV2.metrics.GetGRPCServerOption())

// Register reflection service on gRPC server
// This makes "grpcurl -plaintext localhost:9000 list" command work
Expand Down
37 changes: 32 additions & 5 deletions node/grpc/server_v2.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@ import (
"context"
"encoding/hex"
"fmt"
"runtime"

"github.com/Layr-Labs/eigenda/api"
pb "github.com/Layr-Labs/eigenda/api/grpc/node/v2"
"github.com/Layr-Labs/eigenda/common"
Expand All @@ -14,7 +12,10 @@ import (
corev2 "github.com/Layr-Labs/eigenda/core/v2"
"github.com/Layr-Labs/eigenda/node"
"github.com/Layr-Labs/eigensdk-go/logging"
"github.com/prometheus/client_golang/prometheus"
"github.com/shirou/gopsutil/mem"
"runtime"
"time"
)

// ServerV2 implements the Node v2 proto APIs.
Expand All @@ -26,6 +27,7 @@ type ServerV2 struct {
node *node.Node
ratelimiter common.RateLimiter
logger logging.Logger
metrics *V2Metrics
}

// NewServerV2 creates a new Server instance with the provided parameters.
Expand All @@ -34,13 +36,20 @@ func NewServerV2(
node *node.Node,
logger logging.Logger,
ratelimiter common.RateLimiter,
) *ServerV2 {
registry *prometheus.Registry) (*ServerV2, error) {

metrics, err := NewV2Metrics(logger, registry, config.DbPath, config.DBSizePollPeriod)
if err != nil {
return nil, err
}

return &ServerV2{
config: config,
node: node,
ratelimiter: ratelimiter,
logger: logger,
}
metrics: metrics,
}, nil
}

func (s *ServerV2) NodeInfo(ctx context.Context, in *pb.NodeInfoRequest) (*pb.NodeInfoReply, error) {
Expand All @@ -58,6 +67,8 @@ func (s *ServerV2) NodeInfo(ctx context.Context, in *pb.NodeInfoRequest) (*pb.No
}

func (s *ServerV2) StoreChunks(ctx context.Context, in *pb.StoreChunksRequest) (*pb.StoreChunksReply, error) {
start := time.Now()

if !s.config.EnableV2 {
return nil, api.NewErrorInvalidArg("v2 API is disabled")
}
Expand Down Expand Up @@ -92,7 +103,7 @@ func (s *ServerV2) StoreChunks(ctx context.Context, in *pb.StoreChunksRequest) (
}
storeChan := make(chan storeResult)
go func() {
keys, err := s.node.StoreV2.StoreBatch(batch, rawBundles)
keys, size, err := s.node.StoreV2.StoreBatch(batch, rawBundles)
if err != nil {
storeChan <- storeResult{
keys: nil,
Expand All @@ -101,6 +112,8 @@ func (s *ServerV2) StoreChunks(ctx context.Context, in *pb.StoreChunksRequest) (
return
}

s.metrics.ReportStoreChunksDataSize(size)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what if the store operation gets reverted in L125?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As a general rule of thumb, should we report incremental metrics if the operation as a whole fails? Or should we only report metrics for an operation if it is successful? (in another PR, you suggested that I should report latencies even when there are failures).

I can make this only report if the request ends up being valid, but I want to be consistent with the way we handle scenarios like this.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As discussed offline, this will be left the way it currently is.


storeChan <- storeResult{
keys: keys,
err: nil,
Expand All @@ -124,6 +137,9 @@ func (s *ServerV2) StoreChunks(ctx context.Context, in *pb.StoreChunksRequest) (
}

sig := s.node.KeyPair.SignMessage(batchHeaderHash).Bytes()

s.metrics.ReportStoreChunksLatency(time.Since(start))

return &pb.StoreChunksReply{
Signature: sig[:],
}, nil
Expand All @@ -144,6 +160,8 @@ func (s *ServerV2) validateStoreChunksRequest(req *pb.StoreChunksRequest) (*core
}

func (s *ServerV2) GetChunks(ctx context.Context, in *pb.GetChunksRequest) (*pb.GetChunksReply, error) {
start := time.Now()

if !s.config.EnableV2 {
return nil, api.NewErrorInvalidArg("v2 API is disabled")
}
Expand All @@ -166,6 +184,15 @@ func (s *ServerV2) GetChunks(ctx context.Context, in *pb.GetChunksRequest) (*pb.
return nil, api.NewErrorInternal(fmt.Sprintf("failed to get chunks: %v", err))
}

size := 0
if len(chunks) > 0 {
size = len(chunks[0]) * len(chunks)
}
s.metrics.ReportGetChunksDataSize(size)

elapsed := time.Since(start)
s.metrics.ReportGetChunksLatency(elapsed)
cody-littley marked this conversation as resolved.
Show resolved Hide resolved

return &pb.GetChunksReply{
Chunks: chunks,
}, nil
Expand Down
3 changes: 2 additions & 1 deletion node/grpc/server_v2_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,8 @@ func newTestComponents(t *testing.T, config *node.Config) *testComponents {
RelayClient: atomicRelayClient,
}
node.BlobVersionParams.Store(v2.NewBlobVersionParameterMap(blobParamsMap))
server := grpc.NewServerV2(config, node, logger, ratelimiter)
server, err := grpc.NewServerV2(config, node, logger, ratelimiter, prometheus.NewRegistry())
require.NoError(t, err)
return &testComponents{
server: server,
node: node,
Expand Down
Loading
Loading