Skip to content

Commit

Permalink
Region snapshot replacement for read-only regions (#7435)
Browse files Browse the repository at this point in the history
Reuse the region snapshot replacement machinery to replace read-only
regions. This is done by storing a replacement type in the region
snapshot replacement record such that either a region snapshot _or_ a
read-only region can be the subject of this type of replacement. The
procedure for both types is the same so all the code can be reused.

A future commit will rename region snapshot replacement (and all
references) to "read-only target replacement" to reflect that the
machinery now applies to both region snapshots and read-only regions.
This will be a mostly mechanical set of changes that can be reviewed
separately with much less scrutiny. For now, manually requesting a
read-only region replacement with omdb goes through the region
replacement manual request, not the region snapshot replacement manual
request. That future commit will change this so that it becomes part of
a read-only target replacement request.

Fixes #6172
  • Loading branch information
jmpesp authored Feb 12, 2025
1 parent 798b276 commit a45a089
Show file tree
Hide file tree
Showing 41 changed files with 2,130 additions and 490 deletions.
59 changes: 45 additions & 14 deletions dev-tools/omdb/src/bin/omdb/db.rs
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ use nexus_db_model::NetworkInterfaceKind;
use nexus_db_model::PhysicalDisk;
use nexus_db_model::Probe;
use nexus_db_model::Project;
use nexus_db_model::ReadOnlyTargetReplacement;
use nexus_db_model::Region;
use nexus_db_model::RegionReplacement;
use nexus_db_model::RegionReplacementState;
Expand Down Expand Up @@ -3068,11 +3069,19 @@ async fn cmd_db_region_replacement_request(
) -> Result<(), anyhow::Error> {
let region = datastore.get_region(args.region_id).await?;

let request_id = datastore
.create_region_replacement_request_for_region(opctx, &region)
.await?;
if region.read_only() {
let request_id = datastore
.create_read_only_region_replacement_request(opctx, region.id())
.await?;

println!("region snapshot replacement {request_id} created");
} else {
let request_id = datastore
.create_region_replacement_request_for_region(opctx, &region)
.await?;

println!("region replacement {request_id} created");
println!("region replacement {request_id} created");
}

Ok(())
}
Expand Down Expand Up @@ -4448,12 +4457,22 @@ async fn cmd_db_region_snapshot_replacement_status(
" state: {:?}",
request.replacement_state
);
println!(
" region snapshot: {} {} {}",
request.old_dataset_id,
request.old_region_id,
request.old_snapshot_id,
);
match request.replacement_type() {
ReadOnlyTargetReplacement::RegionSnapshot {
dataset_id,
region_id,
snapshot_id,
} => {
println!(
" region snapshot: {} {} {}",
dataset_id, region_id, snapshot_id,
);
}

ReadOnlyTargetReplacement::ReadOnlyRegion { region_id } => {
println!(" read-only region: {}", region_id);
}
}
println!(" new region id: {:?}", request.new_region_id);
println!(" in-progress steps left: {:?}", steps_left);
println!();
Expand Down Expand Up @@ -4485,10 +4504,22 @@ async fn cmd_db_region_snapshot_replacement_info(

println!(" started: {}", request.request_time);
println!(" state: {:?}", request.replacement_state);
println!(
" region snapshot: {} {} {}",
request.old_dataset_id, request.old_region_id, request.old_snapshot_id,
);
match request.replacement_type() {
ReadOnlyTargetReplacement::RegionSnapshot {
dataset_id,
region_id,
snapshot_id,
} => {
println!(
" region snapshot: {} {} {}",
dataset_id, region_id, snapshot_id,
);
}

ReadOnlyTargetReplacement::ReadOnlyRegion { region_id } => {
println!(" read-only region: {}", region_id);
}
}
println!(" new region id: {:?}", request.new_region_id);
println!(" in-progress steps left: {:?}", steps_left);
println!();
Expand Down
30 changes: 30 additions & 0 deletions dev-tools/omdb/src/bin/omdb/nexus.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ use nexus_types::internal_api::background::BlueprintRendezvousStatus;
use nexus_types::internal_api::background::InstanceReincarnationStatus;
use nexus_types::internal_api::background::InstanceUpdaterStatus;
use nexus_types::internal_api::background::LookupRegionPortStatus;
use nexus_types::internal_api::background::ReadOnlyRegionReplacementStartStatus;
use nexus_types::internal_api::background::RegionReplacementDriverStatus;
use nexus_types::internal_api::background::RegionReplacementStatus;
use nexus_types::internal_api::background::RegionSnapshotReplacementFinishStatus;
Expand Down Expand Up @@ -928,6 +929,9 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) {
"phantom_disks" => {
print_task_phantom_disks(details);
}
"read_only_region_replacement_start" => {
print_task_read_only_region_replacement_start(details);
}
"region_replacement" => {
print_task_region_replacement(details);
}
Expand Down Expand Up @@ -1724,6 +1728,32 @@ fn print_task_phantom_disks(details: &serde_json::Value) {
};
}

/// Print the status details reported by the
/// `read_only_region_replacement_start` background task.
///
/// `details` is deserialized into a `ReadOnlyRegionReplacementStartStatus`.
/// If that fails, a warning containing the error and the raw details is
/// written to stderr and nothing else is printed. Otherwise the number of
/// successfully created requests and the number of errors are printed to
/// stdout, each followed by one line per entry.
fn print_task_read_only_region_replacement_start(details: &serde_json::Value) {
    // `from_value` consumes its argument, so deserialize from a copy and keep
    // the borrowed original available for the warning message below.
    let parsed = serde_json::from_value::<ReadOnlyRegionReplacementStartStatus>(
        details.clone(),
    );

    let status = match parsed {
        Ok(status) => status,
        Err(error) => {
            eprintln!(
                "warning: failed to interpret task details: {:?}: {:?}",
                error, details
            );
            return;
        }
    };

    // Summary count first, then each created request on its own line.
    let created = &status.requests_created_ok;
    println!(" total requests created ok: {}", created.len());
    for line in created {
        println!(" > {line}");
    }

    // Same layout for the errors the task reported.
    let errors = &status.errors;
    println!(" errors: {}", errors.len());
    for line in errors {
        println!(" > {line}");
    }
}

fn print_task_region_replacement(details: &serde_json::Value) {
match serde_json::from_value::<RegionReplacementStatus>(details.clone()) {
Err(error) => eprintln!(
Expand Down
12 changes: 12 additions & 0 deletions dev-tools/omdb/tests/env.out
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,10 @@ task: "physical_disk_adoption"
ensure new physical disks are automatically marked in-service


task: "read_only_region_replacement_start"
detect if read-only regions need replacement and begin the process


task: "region_replacement"
detects if a region requires replacing and begins the process

Expand Down Expand Up @@ -313,6 +317,10 @@ task: "physical_disk_adoption"
ensure new physical disks are automatically marked in-service


task: "read_only_region_replacement_start"
detect if read-only regions need replacement and begin the process


task: "region_replacement"
detects if a region requires replacing and begins the process

Expand Down Expand Up @@ -484,6 +492,10 @@ task: "physical_disk_adoption"
ensure new physical disks are automatically marked in-service


task: "read_only_region_replacement_start"
detect if read-only regions need replacement and begin the process


task: "region_replacement"
detects if a region requires replacing and begins the process

Expand Down
20 changes: 20 additions & 0 deletions dev-tools/omdb/tests/successes.out
Original file line number Diff line number Diff line change
Expand Up @@ -337,6 +337,10 @@ task: "physical_disk_adoption"
ensure new physical disks are automatically marked in-service


task: "read_only_region_replacement_start"
detect if read-only regions need replacement and begin the process


task: "region_replacement"
detects if a region requires replacing and begins the process

Expand Down Expand Up @@ -613,6 +617,14 @@ task: "physical_disk_adoption"
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
last completion reported error: task disabled

task: "read_only_region_replacement_start"
configured period: every <REDACTED_DURATION>m
currently executing: no
last completed activation: <REDACTED ITERATIONS>, triggered by a periodic timer firing
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
total requests created ok: 0
errors: 0

task: "region_replacement"
configured period: every <REDACTED_DURATION>m
currently executing: no
Expand Down Expand Up @@ -1104,6 +1116,14 @@ task: "physical_disk_adoption"
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
last completion reported error: task disabled

task: "read_only_region_replacement_start"
configured period: every <REDACTED_DURATION>m
currently executing: no
last completed activation: <REDACTED ITERATIONS>, triggered by a periodic timer firing
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
total requests created ok: 0
errors: 0

task: "region_replacement"
configured period: every <REDACTED_DURATION>m
currently executing: no
Expand Down
17 changes: 17 additions & 0 deletions nexus-config/src/nexus_config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -419,6 +419,9 @@ pub struct BackgroundTaskConfig {
RegionSnapshotReplacementFinishConfig,
/// configuration for TUF artifact replication task
pub tuf_artifact_replication: TufArtifactReplicationConfig,
/// configuration for read-only region replacement start task
pub read_only_region_replacement_start:
ReadOnlyRegionReplacementStartConfig,
}

#[serde_as]
Expand Down Expand Up @@ -735,6 +738,14 @@ pub struct TufArtifactReplicationConfig {
pub min_sled_replication: usize,
}

/// Configuration for the read-only region replacement start background task.
///
/// Stored in `BackgroundTaskConfig::read_only_region_replacement_start` and
/// set in the config file via the
/// `read_only_region_replacement_start.period_secs` key.
#[serde_as]
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct ReadOnlyRegionReplacementStartConfig {
/// period (in seconds) for periodic activations of this background task
// Serialized as a whole number of seconds (u64) rather than serde's default
// `Duration` representation.
#[serde_as(as = "DurationSeconds<u64>")]
pub period_secs: Duration,
}

/// Configuration for a nexus server
#[derive(Clone, Debug, Deserialize, PartialEq, Serialize)]
pub struct PackageConfig {
Expand Down Expand Up @@ -993,6 +1004,7 @@ mod test {
region_snapshot_replacement_finish.period_secs = 30
tuf_artifact_replication.period_secs = 300
tuf_artifact_replication.min_sled_replication = 3
read_only_region_replacement_start.period_secs = 30
[default_region_allocation_strategy]
type = "random"
seed = 0
Expand Down Expand Up @@ -1194,6 +1206,10 @@ mod test {
period_secs: Duration::from_secs(300),
min_sled_replication: 3,
},
read_only_region_replacement_start:
ReadOnlyRegionReplacementStartConfig {
period_secs: Duration::from_secs(30),
},
},
default_region_allocation_strategy:
crate::nexus_config::RegionAllocationStrategy::Random {
Expand Down Expand Up @@ -1279,6 +1295,7 @@ mod test {
region_snapshot_replacement_finish.period_secs = 30
tuf_artifact_replication.period_secs = 300
tuf_artifact_replication.min_sled_replication = 3
read_only_region_replacement_start.period_secs = 30
[default_region_allocation_strategy]
type = "random"
"##,
Expand Down
Loading

0 comments on commit a45a089

Please sign in to comment.