improved TUF artifact replication robustness #7519

Open, wants to merge 7 commits into base: main
33 changes: 23 additions & 10 deletions common/src/api/external/mod.rs
Original file line number Diff line number Diff line change
@@ -748,16 +748,7 @@ impl From<ByteCount> for i64 {
// store values greater than that as negative values, but surely 2**63 is
// enough.)
#[derive(
Copy,
Clone,
Debug,
Eq,
Hash,
JsonSchema,
Ord,
PartialEq,
PartialOrd,
Serialize,
Copy, Clone, Debug, Eq, Hash, JsonSchema, Ord, PartialEq, PartialOrd,
)]
pub struct Generation(u64);

@@ -808,6 +799,17 @@ impl<'de> Deserialize<'de> for Generation {
}
}

// This is the equivalent of applying `#[serde(transparent)]`, but that has a
// side effect of changing the JsonSchema derive to no longer emit a schema.
impl Serialize for Generation {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
self.0.serialize(serializer)
}
}
Comment on lines +802 to +811
@iliana (Contributor, Author), Feb 11, 2025:

I want to call out this change -- I believe there is a bug in progenitor 0.9.x where newtype structs that do not have #[serde(transparent)] cannot be serialized in a query string, and I need to go file an issue for it. But I think in practice it is more accurate to manually implement Serialize in Omicron. In practice this change does not affect existing JSON serialization because serde_json treats newtype structs as their inner value.


impl Display for Generation {
fn fmt(&self, f: &mut Formatter<'_>) -> FormatResult {
f.write_str(&self.0.to_string())
@@ -863,6 +865,17 @@ impl FromStr for Generation {
}
}

impl slog::Value for Generation {
fn serialize(
&self,
_rec: &slog::Record,
key: slog::Key,
serializer: &mut dyn slog::Serializer,
) -> slog::Result {
serializer.emit_u64(key, self.0)
}
}

#[derive(Debug, thiserror::Error)]
#[error("negative generation number")]
pub struct GenerationNegativeError(());
8 changes: 4 additions & 4 deletions dev-tools/omdb/src/bin/omdb/nexus.rs
@@ -2152,26 +2152,26 @@ fn print_task_support_bundle_collector(details: &serde_json::Value) {
fn print_task_tuf_artifact_replication(details: &serde_json::Value) {
fn print_counters(counters: TufArtifactReplicationCounters) {
const ROWS: &[&str] = &[
"put config ok:",
"put config err:",
"list ok:",
"list err:",
"put ok:",
"put err:",
"copy ok:",
"copy err:",
"delete ok:",
"delete err:",
];
const WIDTH: usize = const_max_len(ROWS);

for (label, value) in ROWS.iter().zip([
counters.put_config_ok,
counters.put_config_err,
counters.list_ok,
counters.list_err,
counters.put_ok,
counters.put_err,
counters.copy_ok,
counters.copy_err,
counters.delete_ok,
counters.delete_err,
]) {
println!(" {label:<WIDTH$} {value:>3}");
}
64 changes: 32 additions & 32 deletions dev-tools/omdb/tests/successes.out
@@ -730,23 +730,23 @@ task: "tuf_artifact_replication"
request ringbuf:
<REDACTED_SECTION>
last run:
list ok: <LIST_OK_REDACTED>
list err: 0
put ok: 0
put err: 0
copy ok: 0
copy err: 0
delete ok: 0
delete err: 0
put config ok: <PUT_CONFIG_OK_REDACTED>
put config err: 0
list ok: <LIST_OK_REDACTED>
list err: 0
put ok: 0
put err: 0
copy ok: 0
copy err: 0
lifetime:
list ok: <LIST_OK_REDACTED>
list err: 0
put ok: 0
put err: 0
copy ok: 0
copy err: 0
delete ok: 0
delete err: 0
put config ok: <PUT_CONFIG_OK_REDACTED>
put config err: 0
list ok: <LIST_OK_REDACTED>
list err: 0
put ok: 0
put err: 0
copy ok: 0
copy err: 0
local repos: 0

task: "v2p_manager"
@@ -1221,23 +1221,23 @@ task: "tuf_artifact_replication"
request ringbuf:
<REDACTED_SECTION>
last run:
list ok: <LIST_OK_REDACTED>
list err: 0
put ok: 0
put err: 0
copy ok: 0
copy err: 0
delete ok: 0
delete err: 0
put config ok: <PUT_CONFIG_OK_REDACTED>
put config err: 0
list ok: <LIST_OK_REDACTED>
list err: 0
put ok: 0
put err: 0
copy ok: 0
copy err: 0
lifetime:
list ok: <LIST_OK_REDACTED>
list err: 0
put ok: 0
put err: 0
copy ok: 0
copy err: 0
delete ok: 0
delete err: 0
put config ok: <PUT_CONFIG_OK_REDACTED>
put config err: 0
list ok: <LIST_OK_REDACTED>
list err: 0
put ok: 0
put err: 0
copy ok: 0
copy err: 0
local repos: 0

task: "v2p_manager"
1 change: 1 addition & 0 deletions dev-tools/omdb/tests/test_all_output.rs
@@ -227,6 +227,7 @@ async fn test_omdb_success_cases(cptestctx: &ControlPlaneTestContext) {
// execution. These redactions work around the issue described in
// https://github.com/oxidecomputer/omicron/issues/7417.
redactor
.field("put config ok:", r"\d+")
.field("list ok:", r"\d+")
.section(&["task: \"tuf_artifact_replication\"", "request ringbuf:"]);

185 changes: 185 additions & 0 deletions docs/tuf-artifact-replication.adoc
@@ -0,0 +1,185 @@
:showtitle:
:numbered:

= TUF Artifact Replication (a.k.a. TUF Repo Depot)

Our release artifact is a TUF repo consisting of all of the artifacts
the product requires to run. For the update system to work, it needs
access to those artifacts. There are some constraining factors:

* Nexus is the only way into the system for these artifacts (either
through direct upload from an operator, or a download initiated by
Nexus to a service outside of the system).
* Nexus has no persistent local storage, nor can it directly use the
artifacts (OS and zone images, firmware, etc.) even if it did store
them.
* Sled Agent is generally what will directly use the artifacts (except
for SP and ROT images, which MGS needs), and it can also manage its
own local storage.

Thus Nexus needs to accept artifacts from outside of the system and
immediately offload them to individual Sled Agents for persistent
storage and later use.

We have chosen (see <<rfd424>>) the simplest possible implementation:
every Sled Agent stores a copy of every artifact on each of its M.2
devices. This is storage inefficient but means that a Sled Agent can
directly use those resources to create zones from updated images,
install an updated OS, or manage the installation of updates on other
components, without Nexus having to ensure that it tells a sled to
have an artifact _before_ telling it to use it. A Nexus background task
periodically ensures that all sleds have all artifacts.

== Sled Agent implementation

Sled Agent stores artifacts as a content-addressed store on an *update*
dataset on each M.2 device: the file name of each stored artifact is its
SHA-256 hash.

It also stores an _artifact configuration_ in memory: a list of all
artifact hashes that the sled should store, and a generation number.
The generation number is owned by Nexus, which increments the generation
number when the set of TUF repos on the system changes. Sled Agent
prevents modifying the configuration without an increase in the
generation number.
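As a rough sketch, the rule Sled Agent enforces on configuration changes might look like the following (the types and names here are illustrative stand-ins, not the real Omicron API):

```rust
use std::collections::BTreeSet;

// Illustrative types; the real Omicron structures differ.
#[derive(Clone, PartialEq)]
pub struct ArtifactConfig {
    pub generation: u64,
    pub artifacts: BTreeSet<String>, // SHA-256 hashes, hex-encoded
}

#[derive(Debug, PartialEq)]
pub enum PutError {
    GenerationOutOfDate,
    ChangedWithoutNewGeneration,
}

/// Accept a new configuration only if it is an idempotent re-put
/// (identical contents, same generation) or carries a strictly newer
/// generation number.
pub fn put_config(
    current: &mut Option<ArtifactConfig>,
    new: ArtifactConfig,
) -> Result<(), PutError> {
    match current {
        Some(cur) if new.generation < cur.generation => {
            Err(PutError::GenerationOutOfDate)
        }
        Some(cur) if new.generation == cur.generation => {
            if new.artifacts == cur.artifacts {
                Ok(()) // idempotent re-put; nothing to do
            } else {
                Err(PutError::ChangedWithoutNewGeneration)
            }
        }
        _ => {
            *current = Some(new);
            Ok(())
        }
    }
}
```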

Sled Agent offers the following APIs on the underlay network, intended
for Nexus:

* `artifact_config_get`: Get the current artifact configuration.
* `artifact_config_put`: Put the artifact configuration that should be
in effect. This API is idempotent (putting the same configuration does
not change anything). Modified configurations must also increase the
generation number.
* `artifact_list`: List the artifacts present in the artifact
configuration along with the count of available copies of each
artifact across the *update* datasets. Also includes the current
generation number.
* `artifact_put`: Put the request body into the artifact store.
Rejects the request if the artifact does not belong to the current
configuration.
* `artifact_copy_from_depot`: Sends a request to another Sled Agent (via
the *TUF Repo Depot API*; see below) to fetch an artifact. The base
URL for the source sled is chosen by the requester. This API responds
as soon as the source sled returns a successful HTTP response; the copy
itself proceeds asynchronously. Rejects the request if the artifact does
not belong to the current configuration.

Sled Agent also spawns another Dropshot API server called the *TUF Repo
Depot API* which offers one API on the underlay network, intended for
other Sled Agents:

* `artifact_get_by_sha256`: Get the content of an artifact.

In an asynchronous task called the _delete reconciler_, Sled Agent
periodically scans the *update* datasets for artifacts that are not
part of the present configuration and deletes them. Before each
filesystem operation, the task re-checks the current configuration for
the presence of that artifact hash. The delete reconciler then waits
for the next artifact configuration change before running again.
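A minimal sketch of that re-check-before-delete pattern (function names and types here are hypothetical, not the real Sled Agent code):

```rust
use std::collections::BTreeSet;

/// Sketch of one delete-reconciler pass over a single *update* dataset.
/// `current_config` is re-read immediately before each delete so that a
/// configuration change arriving mid-scan cannot cause a still-wanted
/// artifact to be removed. All names are illustrative.
pub fn reconcile_deletes(
    found_on_disk: &[String],
    current_config: impl Fn() -> BTreeSet<String>,
    mut delete_file: impl FnMut(&str),
) {
    for hash in found_on_disk {
        // Check the *current* configuration, not a snapshot taken at
        // the start of the scan.
        if !current_config().contains(hash) {
            delete_file(hash);
        }
    }
}
```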

== Nexus implementation

Nexus has a `tuf_artifact_replication` background task which runs this
reliable persistent workflow:

1. Collect the artifact configuration (the list of artifact hashes, and
the current generation number) from the database.
2. Call `artifact_config_put` on all sleds. Stop if any sled rejects the
configuration (our information is already out of date).
3. Call `artifact_list` on all sleds. Stop if any sled informs us of a
different generation number.
4. Delete any local copies of repositories where all artifacts are
sufficiently replicated across sleds. ("Sufficiently replicated"
currently means that at least 3 sleds each have at least one copy.)
5. For any artifacts this Nexus has a local copy of, send `artifact_put`
requests to N random sleds, where N is the number of puts required to
sufficiently replicate the artifact.
6. Send `artifact_copy_from_depot` requests to all remaining sleds
missing copies of an artifact. Nexus chooses the source sled randomly
out of the list of sleds that have a copy of the artifact.
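The replication arithmetic in steps 4 and 5 can be sketched as follows. The threshold of 3 sleds comes from the text above; everything else (names, types) is illustrative:

```rust
use std::collections::BTreeMap;

/// "Sufficiently replicated" threshold described above.
const REPLICATION_THRESHOLD: usize = 3;

/// Step 4: a repository's Nexus-local copy may be deleted once every
/// artifact is held by at least `REPLICATION_THRESHOLD` sleds. `counts`
/// maps an artifact hash to the number of sleds reporting a copy.
pub fn repo_sufficiently_replicated(counts: &BTreeMap<String, usize>) -> bool {
    !counts.is_empty()
        && counts.values().all(|&sleds| sleds >= REPLICATION_THRESHOLD)
}

/// Step 5: the number of `artifact_put` requests Nexus issues for one
/// artifact, given how many sleds already hold a copy.
pub fn puts_needed(sleds_with_copy: usize) -> usize {
    REPLICATION_THRESHOLD.saturating_sub(sleds_with_copy)
}
```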

In each task execution, Nexus will attempt to do all possible work
that leads to every sled having a copy of the artifact. In the absence
of random I/O errors, a repository will be fully replicated across
all sleds in the system in the first execution, and the Nexus-local
copy of the repository will be deleted in the second execution.
`artifact_copy_from_depot` requests whose source sled will only have
the artifact after a pending `artifact_put` are scheduled to run once
all `artifact_put` requests complete.

== Preventing conflicts and loss of artifacts

The artifact configuration is used to prevent conflicts that may be
caused by two Nexus instances running the `tuf_artifact_replication`
background task simultaneously with different information. The worst
case scenario for a conflict is the total loss of an artifact across the
system, although there are lesser evils as well. This section describes
a number of possible faults and the mitigations taken.

=== Recently-uploaded repositories and artifact deletion

When Sled Agent receives an artifact configuration change, the delete
reconciler task begins scanning the *update* datasets for artifacts that
are no longer required and deletes them.

Nexus maintains its local copy of recently-uploaded repositories
until it confirms (via the `artifact_list` operation) that all of the
artifacts in the repository are sufficiently replicated (currently, at
least 3 sleds each have at least 1 copy).

If the `artifact_list` operation lists any artifacts that could be
deleted asynchronously, Nexus could incorrectly assume that an artifact
is sufficiently replicated when it is not. This could happen if a
repository is deleted, and another repository containing the same
artifact is uploaded while another Nexus is running the background task.

The artifact configuration is designed to mitigate this. The
`artifact_list` operation filters the list of artifacts to contain
only artifacts present in the current configuration. The delete
reconciler decides whether to delete a file by re-checking the current
configuration.
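A sketch of that filtering on the Sled Agent side (types are illustrative; the real endpoint also returns the current generation number alongside the list):

```rust
use std::collections::{BTreeMap, BTreeSet};

/// Sketch of `artifact_list` filtering: copies found on the *update*
/// datasets are reported only if their hash appears in the current
/// artifact configuration, so callers never count artifacts that the
/// delete reconciler is about to remove.
pub fn filtered_artifact_list(
    copies_on_disk: &BTreeMap<String, usize>,
    config: &BTreeSet<String>,
) -> BTreeMap<String, usize> {
    copies_on_disk
        .iter()
        .filter(|(hash, _)| config.contains(hash.as_str()))
        .map(|(hash, &count)| (hash.clone(), count))
        .collect()
}
```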

When Nexus receives the `artifact_list` response, it verifies that
the generation number reported is the same as the configuration it put
earlier in the same task execution. Because the response only contains
artifacts belonging to the current configuration, and that list of
artifacts is based on the same configuration Nexus believes is current,
it can trust that none of those artifacts are about to be deleted and
safely delete local copies of sufficiently-replicated artifacts.

=== Loss of all sleds with the only copy

There are two potential situations where we could lose the only copy of
an artifact. The first is a Nexus instance crashing or being replaced
before a local artifact can be put to any sleds. Crashes are difficult
to mitigate, as artifacts are currently stored in randomly-named
temporary directories that are non-trivial to recover on startup;
consequently there is no mitigation for this problem today. During
graceful removal of Nexus zones, a quiesced Nexus (see <<rfd459>> and
<<omicron5677>>) should remain alive until all local artifacts are
sufficiently replicated.

The second potential situation is a loss of all sleds that an artifact
is copied to after Nexus deletes its local copy. This is mostly
mitigated by Nexus attempting to fully replicate all artifacts onto
all sleds in every execution of the background task; if there are no
I/O errors, it only takes one task execution to ensure a repository is
present across the entire system.

=== Unnecessary work

`artifact_put` and `artifact_copy_from_depot` requests include the
current generation as a query string parameter. If the generation does
not match the current configuration, or the artifact is not present in
the configuration, Sled Agent rejects the request.
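That guard could be sketched like this (a hand-written stand-in for the real request-handler logic; names are illustrative):

```rust
use std::collections::BTreeSet;

#[derive(Debug, PartialEq)]
pub enum RejectReason {
    GenerationMismatch,
    NotInConfiguration,
}

/// Sketch of the check applied to `artifact_put` and
/// `artifact_copy_from_depot`: the caller's generation (from the query
/// string) must match the live configuration, and the artifact must
/// belong to that configuration.
pub fn check_request(
    request_generation: u64,
    artifact_hash: &str,
    config_generation: u64,
    config_artifacts: &BTreeSet<String>,
) -> Result<(), RejectReason> {
    if request_generation != config_generation {
        return Err(RejectReason::GenerationMismatch);
    }
    if !config_artifacts.contains(artifact_hash) {
        return Err(RejectReason::NotInConfiguration);
    }
    Ok(())
}
```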

[bibliography]
== References

* [[[rfd424]]] Oxide Computer Company.
https://rfd.shared.oxide.computer/rfd/424[TUF Repo Depot].
* [[[rfd459]]] Oxide Computer Company.
https://rfd.shared.oxide.computer/rfd/459[Control plane component lifecycle].
* [[[omicron5677]]] oxidecomputer/omicron.
https://github.com/oxidecomputer/omicron/issues/5677[nexus 'quiesce' support].
7 changes: 7 additions & 0 deletions nexus/db-model/src/schema.rs
@@ -1369,6 +1369,13 @@ allow_tables_to_appear_in_same_query!(
joinable!(tuf_repo_artifact -> tuf_repo (tuf_repo_id));
joinable!(tuf_repo_artifact -> tuf_artifact (tuf_artifact_id));

table! {
tuf_generation (singleton) {
singleton -> Bool,
generation -> Int8,
}
}

table! {
support_bundle {
id -> Uuid,
3 changes: 2 additions & 1 deletion nexus/db-model/src/schema_versions.rs
@@ -17,7 +17,7 @@ use std::collections::BTreeMap;
///
/// This must be updated when you change the database schema. Refer to
/// schema/crdb/README.adoc in the root of this repository for details.
pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(123, 0, 0);
pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(124, 0, 0);

/// List of all past database schema versions, in *reverse* order
///
@@ -29,6 +29,7 @@ static KNOWN_VERSIONS: Lazy<Vec<KnownVersion>> = Lazy::new(|| {
// | leaving the first copy as an example for the next person.
// v
// KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"),
KnownVersion::new(124, "tuf-generation"),
KnownVersion::new(123, "vpc-subnet-contention"),
KnownVersion::new(122, "tuf-artifact-replication"),
KnownVersion::new(121, "dataset-to-crucible-dataset"),