Skip to content

Commit

Permalink
tappd: Fix watchdog issue
Browse files Browse the repository at this point in the history
  • Loading branch information
kvinwang committed Dec 18, 2024
1 parent 6e25422 commit 1498a53
Show file tree
Hide file tree
Showing 6 changed files with 41 additions and 3 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions basefiles/tappd.service
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ Description=Tappd Service
After=network.target

[Service]
ExecStartPre=-/bin/rm -f /var/run/tappd.sock
ExecStart=/bin/tappd --watchdog
Restart=always
User=root
Expand Down
1 change: 1 addition & 0 deletions tappd/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,4 @@ sysinfo.workspace = true
default-net.workspace = true
rocket-vsock-listener.workspace = true
sd-notify.workspace = true
reqwest.workspace = true
8 changes: 8 additions & 0 deletions tappd/rpc/proto/tappd_rpc.proto
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,15 @@ message WorkerInfo {
string tcb_info = 4;
}

// The response to a Worker.Version request
message WorkerVersion {
// Tappd version
string version = 1;
}

// Worker service. Both RPCs take an empty request message.
service Worker {
// Get worker info
rpc Info(google.protobuf.Empty) returns (WorkerInfo) {}
// Get tappd version
rpc Version(google.protobuf.Empty) returns (WorkerVersion) {}
}
25 changes: 23 additions & 2 deletions tappd/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -117,10 +117,31 @@ async fn run_watchdog() {
let heatbeat_interval = heatbeat_interval.max(Duration::from_secs(1));
info!("Watchdog enabled, interval={watchdog_usec}us, heartbeat={heatbeat_interval:?}",);
let mut interval = tokio::time::interval(heatbeat_interval);

// Create HTTP client for health checks
let client = reqwest::Client::new();

loop {
interval.tick().await;
if let Err(err) = sd_notify(false, &[NotifyState::Watchdog]) {
error!("Failed to notify systemd: {err}");

// Perform health check
match client
.get("http://localhost:8090/prpc/Worker.Version")
.send()
.await
{
Ok(response) if response.status().is_success() => {
// Only notify systemd if health check passes
if let Err(err) = sd_notify(false, &[NotifyState::Watchdog]) {
error!("Failed to notify systemd: {err}");
}
}
Ok(response) => {
error!("Health check failed with status: {}", response.status());
}
Err(err) => {
error!("Health check request failed: {}", err);
}
}
}
}
Expand Down
8 changes: 7 additions & 1 deletion tappd/src/rpc_service.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ use serde_json::json;
use tappd_rpc::{
tappd_server::{TappdRpc, TappdServer},
worker_server::{WorkerRpc, WorkerServer},
DeriveKeyArgs, DeriveKeyResponse, TdxQuoteArgs, TdxQuoteResponse, WorkerInfo,
DeriveKeyArgs, DeriveKeyResponse, TdxQuoteArgs, TdxQuoteResponse, WorkerInfo, WorkerVersion,
};
use tdx_attest::eventlog::read_event_logs;

Expand Down Expand Up @@ -152,6 +152,12 @@ impl WorkerRpc for ExternalRpcHandler {
tcb_info,
})
}

/// Returns the tappd version, taken from `CARGO_PKG_VERSION` at compile time.
async fn version(self) -> Result<WorkerVersion> {
    let version = String::from(env!("CARGO_PKG_VERSION"));
    Ok(WorkerVersion { version })
}
}

impl RpcCall<AppState> for ExternalRpcHandler {
Expand Down

0 comments on commit 1498a53

Please sign in to comment.