From 0e6fa63d2f7006aa828c6af5799ced5f947462ba Mon Sep 17 00:00:00 2001 From: Michal Swietek <4404982+mike1729@users.noreply.github.com> Date: Mon, 23 Dec 2024 15:21:38 +0100 Subject: [PATCH] A0-3022: Abft Score Metrics (#1898) Co-authored-by: Michal Swietek --- bin/chain-bootstrapper/src/chain_spec/cli.rs | 15 ++++++--- .../src/abft/current/performance/service.rs | 8 +++++ finality-aleph/src/metrics/abft_score.rs | 31 +++++++++++++++++++ finality-aleph/src/metrics/mod.rs | 2 ++ finality-aleph/src/nodes.rs | 10 ++++-- finality-aleph/src/party/manager/mod.rs | 28 +++++++++++------ finality-aleph/src/party/mod.rs | 2 ++ primitives/src/lib.rs | 2 +- scripts/run_nodes.sh | 13 +++++++- 9 files changed, 93 insertions(+), 18 deletions(-) create mode 100644 finality-aleph/src/metrics/abft_score.rs diff --git a/bin/chain-bootstrapper/src/chain_spec/cli.rs b/bin/chain-bootstrapper/src/chain_spec/cli.rs index 9c57436fc1..a6a5c80120 100644 --- a/bin/chain-bootstrapper/src/chain_spec/cli.rs +++ b/bin/chain-bootstrapper/src/chain_spec/cli.rs @@ -1,4 +1,6 @@ -use primitives::{AccountId, Version as FinalityVersion, LEGACY_FINALITY_VERSION}; +use primitives::{ + AccountId, Version as FinalityVersion, CURRENT_FINALITY_VERSION, LEGACY_FINALITY_VERSION, +}; use sc_chain_spec::ChainType; use sc_cli::clap::{self, Args}; @@ -43,8 +45,8 @@ pub struct ChainSpecParams { rich_account_ids: Option>, /// Finality version at chain inception. - #[arg(long, default_value = LEGACY_FINALITY_VERSION.to_string())] - finality_version: FinalityVersion, + #[arg(long, default_value = "legacy")] + finality_version: String, } impl ChainSpecParams { @@ -81,6 +83,11 @@ impl ChainSpecParams { } pub fn finality_version(&self) -> FinalityVersion { - self.finality_version + match self.finality_version.as_str() { + "current" => CURRENT_FINALITY_VERSION, + "legacy" => LEGACY_FINALITY_VERSION, + _ => panic!("finality version should be 'current' or 'legacy'"), + } + .into() } } diff --git a/finality-aleph/src/abft/current/performance/service.rs b/finality-aleph/src/abft/current/performance/service.rs index 88928f8da2..06b2db2abb 100644 --- a/finality-aleph/src/abft/current/performance/service.rs +++ b/finality-aleph/src/abft/current/performance/service.rs @@ -11,6 +11,7 @@ use crate::{ LOG_TARGET, }, data_io::AlephData, + metrics::ScoreMetrics, party::manager::Runnable, Hasher, UnverifiedHeader, }; @@ -62,8 +63,10 @@ pub struct Service where UH: UnverifiedHeader, { + my_index: usize, batches_from_abft: mpsc::UnboundedReceiver>, scorer: Scorer, + metrics: ScoreMetrics, } impl Service @@ -73,8 +76,10 @@ where /// Create a new service, together with a unit finalization handler that should be passed to /// ABFT. It will wrap the provided finalization handler and call it in the background. pub fn new( + my_index: usize, n_members: usize, finalization_handler: FH, + metrics: ScoreMetrics, ) -> ( Self, impl current_aleph_bft::UnitFinalizationHandler, Hasher = Hasher>, @@ -85,8 +90,10 @@ where let (batches_for_us, batches_from_abft) = mpsc::unbounded(); ( Service { + my_index, batches_from_abft, scorer: Scorer::new(NodeCount(n_members)), + metrics, }, FinalizationWrapper::new(finalization_handler, batches_for_us), ) @@ -110,6 +117,7 @@ where }, }; debug!(target: LOG_TARGET, "Received ABFT score: {:?}.", score); + self.metrics.report_score(score[self.my_index]); // TODO(A0-4339): sometimes submit these scores to the chain. } _ = &mut exit => { diff --git a/finality-aleph/src/metrics/abft_score.rs b/finality-aleph/src/metrics/abft_score.rs new file mode 100644 index 0000000000..35336ca7a9 --- /dev/null +++ b/finality-aleph/src/metrics/abft_score.rs @@ -0,0 +1,31 @@ +use substrate_prometheus_endpoint::{register, Gauge, PrometheusError, Registry, U64}; + +#[derive(Clone)] +pub enum ScoreMetrics { + Prometheus { my_score: Gauge }, + Noop, +} + +impl ScoreMetrics { + pub fn new(registry: Option) -> Result { + match registry { + Some(registry) => Ok(ScoreMetrics::Prometheus { + my_score: register( + Gauge::new("my_abft_score", "My abft score observed in last batch")?, + ®istry, + )?, + }), + None => Ok(ScoreMetrics::Noop), + } + } + + pub fn noop() -> Self { + ScoreMetrics::Noop + } + + pub fn report_score(&self, score: u16) { + if let ScoreMetrics::Prometheus { my_score } = self { + my_score.set(score.into()); + } + } +} diff --git a/finality-aleph/src/metrics/mod.rs b/finality-aleph/src/metrics/mod.rs index 4ea3ce6ce0..1e19e02819 100644 --- a/finality-aleph/src/metrics/mod.rs +++ b/finality-aleph/src/metrics/mod.rs @@ -1,9 +1,11 @@ +mod abft_score; mod best_block; mod finality_rate; mod slo; mod timing; pub mod transaction_pool; +pub use abft_score::ScoreMetrics; pub use slo::{run_metrics_service, SloMetrics}; pub use timing::{Checkpoint, DefaultClock}; pub type TimingBlockMetrics = timing::TimingBlockMetrics; diff --git a/finality-aleph/src/nodes.rs b/finality-aleph/src/nodes.rs index 917febbbb7..92b06e70b1 100644 --- a/finality-aleph/src/nodes.rs +++ b/finality-aleph/src/nodes.rs @@ -21,7 +21,7 @@ use crate::{ crypto::AuthorityPen, finalization::AlephFinalizer, idx_to_account::ValidatorIndexToAccountIdConverterImpl, - metrics::{run_metrics_service, SloMetrics}, + metrics::{run_metrics_service, ScoreMetrics, SloMetrics}, network::{ address_cache::validator_address_cache_updater, session::{ConnectionManager, ConnectionManagerConfig}, @@ -146,6 +146,11 @@ where let chain_events = client.chain_status_notifier(); + let score_metrics = ScoreMetrics::new(registry.clone()).unwrap_or_else(|e| { + debug!(target: LOG_TARGET, "Failed to create metrics: {}.", e); + ScoreMetrics::noop() + }); + let slo_metrics = SloMetrics::new(registry.as_ref(), chain_status.clone()); let timing_metrics = slo_metrics.timing_metrics().clone(); @@ -199,7 +204,7 @@ where verifier.clone(), session_info.clone(), sync_io, - registry.clone(), + registry, slo_metrics, favourite_block_user_requests, ) { @@ -266,6 +271,7 @@ where spawn_handle, connection_manager, keystore, + score_metrics, ), session_info, }); diff --git a/finality-aleph/src/party/manager/mod.rs b/finality-aleph/src/party/manager/mod.rs index 82fdb4fcae..2cc552348f 100644 --- a/finality-aleph/src/party/manager/mod.rs +++ b/finality-aleph/src/party/manager/mod.rs @@ -21,7 +21,7 @@ use crate::{ }, crypto::{AuthorityPen, AuthorityVerifier}, data_io::{ChainTracker, DataStore, OrderedDataInterpreter, SubstrateChainInfoProvider}, - metrics::TimingBlockMetrics, + metrics::{ScoreMetrics, TimingBlockMetrics}, mpsc, network::{ data::{ @@ -32,6 +32,7 @@ use crate::{ }, party::{ backup::ABFTBackup, manager::aggregator::AggregatorVersion, traits::NodeSessionManager, + LOG_TARGET, }, sync::JustificationSubmissions, AuthorityId, BlockId, CurrentRmcNetworkData, Keychain, LegacyRmcNetworkData, NodeIndex, @@ -114,6 +115,7 @@ where spawn_handle: SpawnHandle, session_manager: SM, keystore: Arc, + score_metrics: ScoreMetrics, _phantom: PhantomData<(B, H)>, } @@ -146,6 +148,7 @@ where spawn_handle: SpawnHandle, session_manager: SM, keystore: Arc, + score_metrics: ScoreMetrics, ) -> Self { Self { client, @@ -161,6 +164,7 @@ where spawn_handle, session_manager, keystore, + score_metrics, _phantom: PhantomData, } } @@ -274,8 +278,12 @@ where self.verifier.clone(), session_boundaries.clone(), ); - let (abft_performance, abft_batch_handler) = - CurrentPerformanceService::new(n_members, ordered_data_interpreter); + let (abft_performance, abft_batch_handler) = CurrentPerformanceService::new( + node_id.into(), + n_members, + ordered_data_interpreter, + self.score_metrics.clone(), + ); let consensus_config = current_create_aleph_config(n_members, node_id, session_id, self.unit_creation_delay); let data_network = data_network.map(); @@ -325,7 +333,7 @@ where exit_rx: oneshot::Receiver<()>, backup: ABFTBackup, ) -> Subtasks { - debug!(target: "afa", "Authority task {:?}", session_id); + debug!(target: LOG_TARGET, "Authority task {:?}", session_id); let authority_verifier = AuthorityVerifier::new(authorities.to_vec()); let authority_pen = @@ -389,17 +397,17 @@ where { #[cfg(feature = "only_legacy")] _ if self.only_legacy() => { - info!(target: "aleph-party", "Running session with legacy-only AlephBFT version."); + info!(target: LOG_TARGET, "Running session with legacy-only AlephBFT version."); self.legacy_subtasks(params) } // The `as`es here should be removed, but this would require a pallet migration and I // am lazy. Ok(version) if version == CURRENT_VERSION as u32 => { - info!(target: "aleph-party", "Running session with AlephBFT version {}, which is current.", version); + info!(target: LOG_TARGET, "Running session with AlephBFT version {}, which is current.", version); self.current_subtasks(params) } Ok(version) if version == LEGACY_VERSION as u32 => { - info!(target: "aleph-party", "Running session with AlephBFT version {}, which is legacy.", version); + info!(target: LOG_TARGET, "Running session with AlephBFT version {}, which is legacy.", version); self.legacy_subtasks(params) } Ok(version) if version > CURRENT_VERSION as u32 => { @@ -408,8 +416,8 @@ where ) } Ok(version) => { - info!(target: "aleph-party", "Attempting to run session with too old version {}, likely because we are synchronizing old sessions for which we have keys. This will not work, but it doesn't matter.", version); - info!(target: "aleph-party", "Running session with AlephBFT version {}, which is legacy.", LEGACY_VERSION); + info!(target: LOG_TARGET, "Attempting to run session with too old version {}, likely because we are synchronizing old sessions for which we have keys. This will not work, but it doesn't matter.", version); + info!(target: LOG_TARGET, "Running session with AlephBFT version {}, which is legacy.", LEGACY_VERSION); self.legacy_subtasks(params) } _ => { @@ -461,7 +469,7 @@ where self.spawn_handle .spawn_essential("aleph/session_authority", async move { if subtasks.wait_completion().await.is_err() { - warn!(target: "aleph-party", "Authority subtasks failed."); + warn!(target: LOG_TARGET, "Authority subtasks failed."); } }), node_id, diff --git a/finality-aleph/src/party/mod.rs b/finality-aleph/src/party/mod.rs index 2bf012f9e6..9652e815d2 100644 --- a/finality-aleph/src/party/mod.rs +++ b/finality-aleph/src/party/mod.rs @@ -24,6 +24,8 @@ pub mod traits; #[cfg(test)] mod mocks; +const LOG_TARGET: &str = "aleph-party"; + pub(crate) struct ConsensusPartyParams { pub session_authorities: ReadOnlySessionMap, pub chain_state: CS, diff --git a/primitives/src/lib.rs b/primitives/src/lib.rs index 981258a347..59a0627d75 100644 --- a/primitives/src/lib.rs +++ b/primitives/src/lib.rs @@ -152,7 +152,7 @@ pub const DEFAULT_FINALITY_VERSION: Version = 0; pub const CURRENT_FINALITY_VERSION: u16 = LEGACY_FINALITY_VERSION + 1; /// Legacy version of abft. -pub const LEGACY_FINALITY_VERSION: u16 = 3; +pub const LEGACY_FINALITY_VERSION: u16 = 4; /// Percentage of validator performance that is treated as 100% performance pub const LENIENT_THRESHOLD: Perquintill = Perquintill::from_percent(90); diff --git a/scripts/run_nodes.sh b/scripts/run_nodes.sh index c830915d9c..30cd64ec1e 100755 --- a/scripts/run_nodes.sh +++ b/scripts/run_nodes.sh @@ -64,6 +64,8 @@ Usage: [-p|--base-path BASE_PATH] if specified, use given base path (keystore, db, AlephBFT backups) if not specified, base path is ./run-nodes-local + [--finality-version] + which finality version should be used, default = legacy [--dont-bootstrap] set if you don't want to bootstrap chain, ie generate keystore and chainspec [--dont-build] @@ -85,6 +87,7 @@ DONT_BOOTSTRAP=${DONT_BOOTSTRAP:-""} DONT_BUILD_ALEPH_NODE=${DONT_BUILD_ALEPH_NODE:-""} DONT_DELETE_DB=${DONT_DELETE_DB:-""} DONT_REMOVE_ABFT_BACKUPS=${DONT_REMOVE_ABFT_BACKUPS:-""} +FINALITY_VERSION=${FINALITY_VERSION:-"legacy"} while [[ $# -gt 0 ]]; do case "$1" in @@ -100,6 +103,10 @@ while [[ $# -gt 0 ]]; do BASE_PATH="$2" shift;shift ;; + --finality-version) + FINALITY_VERSION="$2" + shift;shift + ;; --dont-bootstrap) DONT_BOOTSTRAP="true" shift @@ -219,6 +226,9 @@ fi if ! command -v jq &> /dev/null; then error "jq could not be found on PATH!" fi +if [[ "${FINALITY_VERSION}" != "current" && "${FINALITY_VERSION}" != "legacy" ]]; then + error "Flag finality-version should be either current or legacy." +fi # ------------------- main script starts here ------------------------------ @@ -276,7 +286,8 @@ if [[ -z "${DONT_BOOTSTRAP}" ]]; then --account-ids "${all_account_ids_string}" \ --authorities-account-ids "${validator_ids_string}" \ --chain-type local > "${BASE_PATH}/chainspec.json" \ - --rich-account-ids "${all_account_ids_string}" + --rich-account-ids "${all_account_ids_string}" \ + --finality-version "${FINALITY_VERSION}" if [[ "${DONT_REMOVE_ABFT_BACKUPS}" == "true" ]]; then all_account_ids=(${validator_account_ids[@]} ${rpc_node_account_ids[@]})