fix(supervisor): publish full status (pid, port, uptime, restart_count, last_exit) via watch channel
Replace watch::Receiver<ServerState> on SupervisorHandle with watch::Receiver<Status>, a richer snapshot type that carries pid, port, uptime_secs, restart_count, and last_exit. SupervisorTask maintains current_pid and publishes a fresh Status on every state transition; handlers.rs reads the full Status, so list/status no longer return zeroed/None fields.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -13,5 +13,6 @@ pub use logs::{LogSink, RecordedLine, RingBuffer, RotatingLogWriter};
|
|||||||
pub use policy::{RestartDecision, decide};
|
pub use policy::{RestartDecision, decide};
|
||||||
pub use retry_window::RetryWindow;
|
pub use retry_window::RetryWindow;
|
||||||
pub use supervisor::{
|
pub use supervisor::{
|
||||||
RealSpawner, Spawner, StartAck, StopAck, SupervisorCmd, SupervisorHandle, SupervisorTask,
|
RealSpawner, Spawner, StartAck, Status, StopAck, SupervisorCmd, SupervisorHandle,
|
||||||
|
SupervisorTask,
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -42,11 +42,21 @@ pub enum StopAck {
|
|||||||
NotRunning,
|
NotRunning,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub struct Status {
|
||||||
|
pub state: ServerState,
|
||||||
|
pub pid: Option<u32>,
|
||||||
|
pub port: u16,
|
||||||
|
pub uptime_secs: Option<u64>,
|
||||||
|
pub restart_count: u32,
|
||||||
|
pub last_exit: Option<i32>,
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
pub struct SupervisorHandle {
|
pub struct SupervisorHandle {
|
||||||
pub name: String,
|
pub name: String,
|
||||||
pub tx: mpsc::Sender<SupervisorCmd>,
|
pub tx: mpsc::Sender<SupervisorCmd>,
|
||||||
pub state: watch::Receiver<ServerState>,
|
pub status: watch::Receiver<Status>,
|
||||||
pub log_sink: LogSink,
|
pub log_sink: LogSink,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -60,13 +70,14 @@ pub struct SupervisorTask<S: Spawner> {
|
|||||||
cfg: ServerConfig,
|
cfg: ServerConfig,
|
||||||
log_sink: LogSink,
|
log_sink: LogSink,
|
||||||
spawner: S,
|
spawner: S,
|
||||||
state_tx: watch::Sender<ServerState>,
|
status_tx: watch::Sender<Status>,
|
||||||
cmd_rx: mpsc::Receiver<SupervisorCmd>,
|
cmd_rx: mpsc::Receiver<SupervisorCmd>,
|
||||||
backoff: Backoff,
|
backoff: Backoff,
|
||||||
retry_window: RetryWindow,
|
retry_window: RetryWindow,
|
||||||
restart_count: u32,
|
restart_count: u32,
|
||||||
last_exit: Option<i32>,
|
last_exit: Option<i32>,
|
||||||
started_at: Option<Instant>,
|
started_at: Option<Instant>,
|
||||||
|
current_pid: Option<u32>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<S: Spawner> SupervisorTask<S> {
|
impl<S: Spawner> SupervisorTask<S> {
|
||||||
@@ -74,7 +85,7 @@ impl<S: Spawner> SupervisorTask<S> {
|
|||||||
cfg: ServerConfig,
|
cfg: ServerConfig,
|
||||||
log_sink: LogSink,
|
log_sink: LogSink,
|
||||||
spawner: S,
|
spawner: S,
|
||||||
state_tx: watch::Sender<ServerState>,
|
status_tx: watch::Sender<Status>,
|
||||||
cmd_rx: mpsc::Receiver<SupervisorCmd>,
|
cmd_rx: mpsc::Receiver<SupervisorCmd>,
|
||||||
) -> Self {
|
) -> Self {
|
||||||
let backoff = Backoff::new(cfg.restart.backoff_initial, cfg.restart.backoff_max);
|
let backoff = Backoff::new(cfg.restart.backoff_initial, cfg.restart.backoff_max);
|
||||||
@@ -85,18 +96,28 @@ impl<S: Spawner> SupervisorTask<S> {
|
|||||||
cfg,
|
cfg,
|
||||||
log_sink,
|
log_sink,
|
||||||
spawner,
|
spawner,
|
||||||
state_tx,
|
status_tx,
|
||||||
cmd_rx,
|
cmd_rx,
|
||||||
backoff,
|
backoff,
|
||||||
retry_window,
|
retry_window,
|
||||||
restart_count: 0,
|
restart_count: 0,
|
||||||
last_exit: None,
|
last_exit: None,
|
||||||
started_at: None,
|
started_at: None,
|
||||||
|
current_pid: None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn set_state(&self, s: ServerState) {
|
fn set_state(&mut self, s: ServerState) {
|
||||||
let _ = self.state_tx.send(s);
|
let uptime_secs = self.started_at.map(|t| t.elapsed().as_secs());
|
||||||
|
|
||||||
|
let _ = self.status_tx.send(Status {
|
||||||
|
state: s,
|
||||||
|
pid: self.current_pid,
|
||||||
|
port: self.cfg.port,
|
||||||
|
uptime_secs,
|
||||||
|
restart_count: self.restart_count,
|
||||||
|
last_exit: self.last_exit,
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn run(mut self) {
|
pub async fn run(mut self) {
|
||||||
@@ -174,6 +195,7 @@ impl<S: Spawner> SupervisorTask<S> {
|
|||||||
child = None;
|
child = None;
|
||||||
|
|
||||||
self.last_exit = code;
|
self.last_exit = code;
|
||||||
|
self.current_pid = None;
|
||||||
|
|
||||||
let now = Instant::now();
|
let now = Instant::now();
|
||||||
|
|
||||||
@@ -221,6 +243,7 @@ impl<S: Spawner> SupervisorTask<S> {
|
|||||||
|
|
||||||
self.restart_count = self.restart_count.saturating_add(1);
|
self.restart_count = self.restart_count.saturating_add(1);
|
||||||
self.started_at = Some(Instant::now());
|
self.started_at = Some(Instant::now());
|
||||||
|
self.current_pid = Some(c.pid());
|
||||||
self.backoff.reset();
|
self.backoff.reset();
|
||||||
self.set_state(ServerState::Running);
|
self.set_state(ServerState::Running);
|
||||||
|
|
||||||
@@ -244,6 +267,7 @@ impl<S: Spawner> SupervisorTask<S> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
self.current_pid = None;
|
||||||
self.started_at = None;
|
self.started_at = None;
|
||||||
self.set_state(ServerState::Stopped);
|
self.set_state(ServerState::Stopped);
|
||||||
}
|
}
|
||||||
@@ -317,34 +341,40 @@ mod tests {
|
|||||||
LogSink::new(name.to_string(), writer, 1024)
|
LogSink::new(name.to_string(), writer, 1024)
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn wait_for(rx: &mut watch::Receiver<ServerState>, want: ServerState) {
|
fn initial_status(cfg: &ServerConfig) -> Status {
|
||||||
|
Status {
|
||||||
|
state: ServerState::Stopped,
|
||||||
|
pid: None,
|
||||||
|
port: cfg.port,
|
||||||
|
uptime_secs: None,
|
||||||
|
restart_count: 0,
|
||||||
|
last_exit: None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn wait_for(rx: &mut watch::Receiver<Status>, want: ServerState) {
|
||||||
let deadline = tokio::time::Instant::now() + Duration::from_secs(2);
|
let deadline = tokio::time::Instant::now() + Duration::from_secs(2);
|
||||||
loop {
|
loop {
|
||||||
if *rx.borrow() == want {
|
if rx.borrow().state == want {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
tokio::select! {
|
tokio::select! {
|
||||||
_ = rx.changed() => {}
|
_ = rx.changed() => {}
|
||||||
_ = tokio::time::sleep_until(deadline) => panic!("never reached {want:?}, last={:?}", *rx.borrow()),
|
_ = tokio::time::sleep_until(deadline) => panic!("never reached {want:?}, last={:?}", rx.borrow().state),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn start_runs_to_running_and_stop_to_stopped() {
|
async fn start_runs_to_running_and_stop_to_stopped() {
|
||||||
|
let cfg = cfg("x", RestartPolicy::Never, 5);
|
||||||
let (mock, mut ctl) = MockChild::new(1);
|
let (mock, mut ctl) = MockChild::new(1);
|
||||||
let queue = Arc::new(Mutex::new(vec![mock]));
|
let queue = Arc::new(Mutex::new(vec![mock]));
|
||||||
let spawner = QueueSpawner { queue };
|
let spawner = QueueSpawner { queue };
|
||||||
|
|
||||||
let (state_tx, mut state_rx) = watch::channel(ServerState::Stopped);
|
let (status_tx, mut status_rx) = watch::channel(initial_status(&cfg));
|
||||||
let (cmd_tx, cmd_rx) = mpsc::channel(8);
|
let (cmd_tx, cmd_rx) = mpsc::channel(8);
|
||||||
let task = SupervisorTask::new(
|
let task = SupervisorTask::new(cfg, sink("x"), spawner, status_tx, cmd_rx);
|
||||||
cfg("x", RestartPolicy::Never, 5),
|
|
||||||
sink("x"),
|
|
||||||
spawner,
|
|
||||||
state_tx,
|
|
||||||
cmd_rx,
|
|
||||||
);
|
|
||||||
let h = tokio::spawn(task.run());
|
let h = tokio::spawn(task.run());
|
||||||
|
|
||||||
let (ack_tx, ack_rx) = oneshot::channel();
|
let (ack_tx, ack_rx) = oneshot::channel();
|
||||||
@@ -353,10 +383,10 @@ mod tests {
|
|||||||
.await
|
.await
|
||||||
.unwrap();
|
.unwrap();
|
||||||
assert_eq!(ack_rx.await.unwrap(), StartAck::Started);
|
assert_eq!(ack_rx.await.unwrap(), StartAck::Started);
|
||||||
wait_for(&mut state_rx, ServerState::Running).await;
|
wait_for(&mut status_rx, ServerState::Running).await;
|
||||||
|
|
||||||
ctl.exit_tx.take().unwrap().send(Some(0)).unwrap();
|
ctl.exit_tx.take().unwrap().send(Some(0)).unwrap();
|
||||||
wait_for(&mut state_rx, ServerState::Stopped).await;
|
wait_for(&mut status_rx, ServerState::Stopped).await;
|
||||||
|
|
||||||
let (ack_tx, ack_rx) = oneshot::channel();
|
let (ack_tx, ack_rx) = oneshot::channel();
|
||||||
cmd_tx
|
cmd_tx
|
||||||
|
|||||||
@@ -163,14 +163,16 @@ async fn list(reg: &Registry) -> Result<Vec<ServerSummary>, ApiError> {
|
|||||||
let mut out = Vec::new();
|
let mut out = Vec::new();
|
||||||
|
|
||||||
for (name, entry) in reg.snapshot().await {
|
for (name, entry) in reg.snapshot().await {
|
||||||
|
let s = entry.handle.status.borrow();
|
||||||
|
|
||||||
out.push(ServerSummary {
|
out.push(ServerSummary {
|
||||||
name,
|
name,
|
||||||
state: *entry.handle.state.borrow(),
|
state: s.state,
|
||||||
pid: None,
|
pid: s.pid,
|
||||||
port: 0,
|
port: s.port,
|
||||||
uptime_secs: None,
|
uptime_secs: s.uptime_secs,
|
||||||
restart_count: 0,
|
restart_count: s.restart_count,
|
||||||
last_exit: None,
|
last_exit: s.last_exit,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -185,15 +187,17 @@ async fn status(reg: &Registry, name: &str) -> Result<StatusDetail, ApiError> {
|
|||||||
));
|
));
|
||||||
};
|
};
|
||||||
|
|
||||||
|
let s = entry.handle.status.borrow();
|
||||||
|
|
||||||
Ok(StatusDetail {
|
Ok(StatusDetail {
|
||||||
summary: ServerSummary {
|
summary: ServerSummary {
|
||||||
name: entry.handle.name.clone(),
|
name: entry.handle.name.clone(),
|
||||||
state: *entry.handle.state.borrow(),
|
state: s.state,
|
||||||
pid: None,
|
pid: s.pid,
|
||||||
port: 0,
|
port: s.port,
|
||||||
uptime_secs: None,
|
uptime_secs: s.uptime_secs,
|
||||||
restart_count: 0,
|
restart_count: s.restart_count,
|
||||||
last_exit: None,
|
last_exit: s.last_exit,
|
||||||
},
|
},
|
||||||
recent_transitions: Vec::new(),
|
recent_transitions: Vec::new(),
|
||||||
})
|
})
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ use xy_ipc::{Connection, bind};
|
|||||||
use xy_protocol::{ServerConfig, ServerState, kdl_parse::load_all_configs};
|
use xy_protocol::{ServerConfig, ServerState, kdl_parse::load_all_configs};
|
||||||
use xy_supervisor::{
|
use xy_supervisor::{
|
||||||
logs::{LogSink, RotatingLogWriter},
|
logs::{LogSink, RotatingLogWriter},
|
||||||
supervisor::{RealSpawner, SupervisorCmd, SupervisorHandle, SupervisorTask},
|
supervisor::{RealSpawner, Status, SupervisorCmd, SupervisorHandle, SupervisorTask},
|
||||||
};
|
};
|
||||||
|
|
||||||
pub mod handlers;
|
pub mod handlers;
|
||||||
@@ -39,19 +39,28 @@ pub fn spawn_supervisor(paths: &Paths, cfg: ServerConfig) -> Result<SupervisorHa
|
|||||||
|
|
||||||
let sink = LogSink::new(cfg.name.clone(), writer, RING_BUFFER_BYTES);
|
let sink = LogSink::new(cfg.name.clone(), writer, RING_BUFFER_BYTES);
|
||||||
|
|
||||||
let (state_tx, state_rx) = watch::channel(ServerState::Stopped);
|
let initial_status = Status {
|
||||||
|
state: ServerState::Stopped,
|
||||||
|
pid: None,
|
||||||
|
port: cfg.port,
|
||||||
|
uptime_secs: None,
|
||||||
|
restart_count: 0,
|
||||||
|
last_exit: None,
|
||||||
|
};
|
||||||
|
|
||||||
|
let (status_tx, status_rx) = watch::channel(initial_status);
|
||||||
let (cmd_tx, cmd_rx) = mpsc::channel(16);
|
let (cmd_tx, cmd_rx) = mpsc::channel(16);
|
||||||
|
|
||||||
let name = cfg.name.clone();
|
let name = cfg.name.clone();
|
||||||
|
|
||||||
let task = SupervisorTask::new(cfg, sink.clone(), RealSpawner, state_tx, cmd_rx);
|
let task = SupervisorTask::new(cfg, sink.clone(), RealSpawner, status_tx, cmd_rx);
|
||||||
|
|
||||||
tokio::spawn(task.run());
|
tokio::spawn(task.run());
|
||||||
|
|
||||||
Ok(SupervisorHandle {
|
Ok(SupervisorHandle {
|
||||||
name,
|
name,
|
||||||
tx: cmd_tx,
|
tx: cmd_tx,
|
||||||
state: state_rx,
|
status: status_rx,
|
||||||
log_sink: sink,
|
log_sink: sink,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user