feat(supervisor): supervisor task with state machine
One async task per managed server owns all state transitions via a tokio::select! loop over cmd_rx and wait_child. Includes RealSpawner and a smoke test covering the Start → Running → exit → Stopped → Shutdown happy path. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -5,9 +5,13 @@ pub mod child;
|
|||||||
pub mod logs;
|
pub mod logs;
|
||||||
pub mod policy;
|
pub mod policy;
|
||||||
pub mod retry_window;
|
pub mod retry_window;
|
||||||
|
pub mod supervisor;
|
||||||
|
|
||||||
pub use backoff::Backoff;
|
pub use backoff::Backoff;
|
||||||
pub use child::{ChildHandle, MockChild, MockChildController, RealChild, spawn_with_logs};
|
pub use child::{ChildHandle, MockChild, MockChildController, RealChild, spawn_with_logs};
|
||||||
pub use logs::{LogSink, RecordedLine, RingBuffer, RotatingLogWriter};
|
pub use logs::{LogSink, RecordedLine, RingBuffer, RotatingLogWriter};
|
||||||
pub use policy::{RestartDecision, decide};
|
pub use policy::{RestartDecision, decide};
|
||||||
pub use retry_window::RetryWindow;
|
pub use retry_window::RetryWindow;
|
||||||
|
pub use supervisor::{
|
||||||
|
RealSpawner, Spawner, StartAck, StopAck, SupervisorCmd, SupervisorHandle, SupervisorTask,
|
||||||
|
};
|
||||||
|
|||||||
@@ -135,11 +135,7 @@ pub struct LogSink {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl LogSink {
|
impl LogSink {
|
||||||
pub fn new(
|
pub fn new(server_name: String, writer: RotatingLogWriter, ring_capacity_bytes: usize) -> Self {
|
||||||
server_name: String,
|
|
||||||
writer: RotatingLogWriter,
|
|
||||||
ring_capacity_bytes: usize,
|
|
||||||
) -> Self {
|
|
||||||
let (tx, _) = broadcast::channel(LOG_BROADCAST_CAP);
|
let (tx, _) = broadcast::channel(LOG_BROADCAST_CAP);
|
||||||
Self {
|
Self {
|
||||||
server_name,
|
server_name,
|
||||||
@@ -183,8 +179,8 @@ fn now_unix_ms() -> u64 {
|
|||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
use tempfile::tempdir;
|
|
||||||
use std::io::Read;
|
use std::io::Read;
|
||||||
|
use tempfile::tempdir;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn writes_lines_with_tags() {
|
fn writes_lines_with_tags() {
|
||||||
@@ -194,10 +190,7 @@ mod tests {
|
|||||||
w.write_line("[out]", "hello").unwrap();
|
w.write_line("[out]", "hello").unwrap();
|
||||||
w.write_line("[err]", "boom").unwrap();
|
w.write_line("[err]", "boom").unwrap();
|
||||||
let mut s = String::new();
|
let mut s = String::new();
|
||||||
File::open(&base)
|
File::open(&base).unwrap().read_to_string(&mut s).unwrap();
|
||||||
.unwrap()
|
|
||||||
.read_to_string(&mut s)
|
|
||||||
.unwrap();
|
|
||||||
assert_eq!(s, "[out] hello\n[err] boom\n");
|
assert_eq!(s, "[out] hello\n[err] boom\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,369 @@
|
|||||||
|
use crate::{
|
||||||
|
backoff::Backoff,
|
||||||
|
child::ChildHandle,
|
||||||
|
logs::LogSink,
|
||||||
|
policy::{RestartDecision, decide},
|
||||||
|
retry_window::RetryWindow,
|
||||||
|
};
|
||||||
|
use std::time::{Duration, Instant};
|
||||||
|
use tokio::sync::{mpsc, oneshot, watch};
|
||||||
|
use tokio::time::sleep;
|
||||||
|
use tracing::{debug, info, warn};
|
||||||
|
use xy_protocol::{ServerConfig, ServerState};
|
||||||
|
|
||||||
|
pub enum SupervisorCmd {
|
||||||
|
Start {
|
||||||
|
ack: oneshot::Sender<StartAck>,
|
||||||
|
},
|
||||||
|
Stop {
|
||||||
|
ack: oneshot::Sender<StopAck>,
|
||||||
|
},
|
||||||
|
Restart {
|
||||||
|
ack: oneshot::Sender<()>,
|
||||||
|
},
|
||||||
|
Reconfigure {
|
||||||
|
new: ServerConfig,
|
||||||
|
ack: oneshot::Sender<()>,
|
||||||
|
},
|
||||||
|
Shutdown {
|
||||||
|
ack: oneshot::Sender<()>,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, PartialEq, Eq)]
|
||||||
|
pub enum StartAck {
|
||||||
|
Started,
|
||||||
|
AlreadyRunning,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, PartialEq, Eq)]
|
||||||
|
pub enum StopAck {
|
||||||
|
Stopped,
|
||||||
|
NotRunning,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Clone)]
|
||||||
|
pub struct SupervisorHandle {
|
||||||
|
pub name: String,
|
||||||
|
pub tx: mpsc::Sender<SupervisorCmd>,
|
||||||
|
pub state: watch::Receiver<ServerState>,
|
||||||
|
pub log_sink: LogSink,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait::async_trait]
|
||||||
|
pub trait Spawner: Send + 'static {
|
||||||
|
type Child: ChildHandle;
|
||||||
|
async fn spawn(&self, cfg: &ServerConfig, sink: LogSink) -> std::io::Result<Self::Child>;
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct SupervisorTask<S: Spawner> {
|
||||||
|
cfg: ServerConfig,
|
||||||
|
log_sink: LogSink,
|
||||||
|
spawner: S,
|
||||||
|
state_tx: watch::Sender<ServerState>,
|
||||||
|
cmd_rx: mpsc::Receiver<SupervisorCmd>,
|
||||||
|
backoff: Backoff,
|
||||||
|
retry_window: RetryWindow,
|
||||||
|
restart_count: u32,
|
||||||
|
last_exit: Option<i32>,
|
||||||
|
started_at: Option<Instant>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<S: Spawner> SupervisorTask<S> {
|
||||||
|
pub fn new(
|
||||||
|
cfg: ServerConfig,
|
||||||
|
log_sink: LogSink,
|
||||||
|
spawner: S,
|
||||||
|
state_tx: watch::Sender<ServerState>,
|
||||||
|
cmd_rx: mpsc::Receiver<SupervisorCmd>,
|
||||||
|
) -> Self {
|
||||||
|
let backoff = Backoff::new(cfg.restart.backoff_initial, cfg.restart.backoff_max);
|
||||||
|
let retry_window =
|
||||||
|
RetryWindow::new(Duration::from_secs(60), cfg.restart.max_retries_per_minute);
|
||||||
|
|
||||||
|
Self {
|
||||||
|
cfg,
|
||||||
|
log_sink,
|
||||||
|
spawner,
|
||||||
|
state_tx,
|
||||||
|
cmd_rx,
|
||||||
|
backoff,
|
||||||
|
retry_window,
|
||||||
|
restart_count: 0,
|
||||||
|
last_exit: None,
|
||||||
|
started_at: None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn set_state(&self, s: ServerState) {
|
||||||
|
let _ = self.state_tx.send(s);
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn run(mut self) {
|
||||||
|
let mut child: Option<S::Child> = None;
|
||||||
|
|
||||||
|
loop {
|
||||||
|
tokio::select! {
|
||||||
|
cmd = self.cmd_rx.recv() => {
|
||||||
|
let Some(cmd) = cmd else { break; };
|
||||||
|
|
||||||
|
match cmd {
|
||||||
|
SupervisorCmd::Start { ack } => {
|
||||||
|
if child.is_some() {
|
||||||
|
let _ = ack.send(StartAck::AlreadyRunning);
|
||||||
|
} else {
|
||||||
|
match self.do_start().await {
|
||||||
|
Ok(c) => {
|
||||||
|
child = Some(c);
|
||||||
|
let _ = ack.send(StartAck::Started);
|
||||||
|
}
|
||||||
|
Err(err) => {
|
||||||
|
warn!(name = %self.cfg.name, error = %err, "spawn failed");
|
||||||
|
self.set_state(ServerState::Failed);
|
||||||
|
let _ = ack.send(StartAck::Started);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
SupervisorCmd::Stop { ack } => {
|
||||||
|
if let Some(c) = child.take() {
|
||||||
|
self.do_stop(c).await;
|
||||||
|
let _ = ack.send(StopAck::Stopped);
|
||||||
|
} else {
|
||||||
|
let _ = ack.send(StopAck::NotRunning);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
SupervisorCmd::Restart { ack } => {
|
||||||
|
if let Some(c) = child.take() {
|
||||||
|
self.set_state(ServerState::Restarting);
|
||||||
|
self.do_stop(c).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
match self.do_start().await {
|
||||||
|
Ok(c) => child = Some(c),
|
||||||
|
Err(err) => {
|
||||||
|
warn!(name = %self.cfg.name, error = %err, "restart spawn failed");
|
||||||
|
self.set_state(ServerState::Failed);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let _ = ack.send(());
|
||||||
|
}
|
||||||
|
SupervisorCmd::Reconfigure { new, ack } => {
|
||||||
|
self.cfg = new;
|
||||||
|
self.backoff =
|
||||||
|
Backoff::new(self.cfg.restart.backoff_initial, self.cfg.restart.backoff_max);
|
||||||
|
self.retry_window = RetryWindow::new(
|
||||||
|
Duration::from_secs(60),
|
||||||
|
self.cfg.restart.max_retries_per_minute,
|
||||||
|
);
|
||||||
|
|
||||||
|
let _ = ack.send(());
|
||||||
|
}
|
||||||
|
SupervisorCmd::Shutdown { ack } => {
|
||||||
|
if let Some(c) = child.take() {
|
||||||
|
self.do_stop(c).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
let _ = ack.send(());
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
code = wait_child(&mut child) => {
|
||||||
|
child = None;
|
||||||
|
|
||||||
|
self.last_exit = code;
|
||||||
|
|
||||||
|
let now = Instant::now();
|
||||||
|
|
||||||
|
self.retry_window.record(now);
|
||||||
|
|
||||||
|
let cap = self.retry_window.cap_reached(now);
|
||||||
|
let decision = decide(self.cfg.restart.policy, code, cap);
|
||||||
|
|
||||||
|
debug!(name = %self.cfg.name, ?code, ?decision, "child exited");
|
||||||
|
|
||||||
|
match decision {
|
||||||
|
RestartDecision::StayStopped => {
|
||||||
|
self.started_at = None;
|
||||||
|
self.set_state(ServerState::Stopped);
|
||||||
|
}
|
||||||
|
RestartDecision::MarkFailed => {
|
||||||
|
self.started_at = None;
|
||||||
|
self.set_state(ServerState::Failed);
|
||||||
|
}
|
||||||
|
RestartDecision::Restart => {
|
||||||
|
self.set_state(ServerState::Restarting);
|
||||||
|
|
||||||
|
let delay = self.backoff.next();
|
||||||
|
|
||||||
|
sleep(delay).await;
|
||||||
|
|
||||||
|
match self.do_start().await {
|
||||||
|
Ok(c) => child = Some(c),
|
||||||
|
Err(err) => {
|
||||||
|
warn!(name = %self.cfg.name, error = %err, "restart spawn failed");
|
||||||
|
self.set_state(ServerState::Failed);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn do_start(&mut self) -> std::io::Result<S::Child> {
|
||||||
|
self.set_state(ServerState::Starting);
|
||||||
|
|
||||||
|
let c = self.spawner.spawn(&self.cfg, self.log_sink.clone()).await?;
|
||||||
|
|
||||||
|
self.restart_count = self.restart_count.saturating_add(1);
|
||||||
|
self.started_at = Some(Instant::now());
|
||||||
|
self.backoff.reset();
|
||||||
|
self.set_state(ServerState::Running);
|
||||||
|
|
||||||
|
info!(name = %self.cfg.name, pid = c.pid(), "started");
|
||||||
|
|
||||||
|
Ok(c)
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn do_stop(&mut self, mut c: S::Child) {
|
||||||
|
self.set_state(ServerState::Stopping);
|
||||||
|
|
||||||
|
let _ = c.terminate();
|
||||||
|
|
||||||
|
let grace = self.cfg.stop.grace;
|
||||||
|
|
||||||
|
match tokio::time::timeout(grace, c.wait()).await {
|
||||||
|
Ok(_) => {}
|
||||||
|
Err(_) => {
|
||||||
|
let _ = c.kill();
|
||||||
|
let _ = c.wait().await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
self.started_at = None;
|
||||||
|
self.set_state(ServerState::Stopped);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn wait_child<C: ChildHandle>(slot: &mut Option<C>) -> Option<i32> {
|
||||||
|
match slot.as_mut() {
|
||||||
|
Some(c) => c.wait().await.ok().flatten(),
|
||||||
|
None => std::future::pending().await,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct RealSpawner;
|
||||||
|
|
||||||
|
#[async_trait::async_trait]
|
||||||
|
impl Spawner for RealSpawner {
|
||||||
|
type Child = crate::child::RealChild;
|
||||||
|
|
||||||
|
async fn spawn(&self, cfg: &ServerConfig, sink: LogSink) -> std::io::Result<Self::Child> {
|
||||||
|
crate::child::spawn_with_logs(cfg, sink)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use super::*;
|
||||||
|
use crate::child::MockChild;
|
||||||
|
use crate::logs::{LogSink, RotatingLogWriter};
|
||||||
|
use std::sync::{Arc, Mutex};
|
||||||
|
use tempfile::tempdir;
|
||||||
|
use xy_protocol::{RestartConfig, RestartPolicy, StopConfig};
|
||||||
|
|
||||||
|
struct QueueSpawner {
|
||||||
|
queue: Arc<Mutex<Vec<MockChild>>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait::async_trait]
|
||||||
|
impl Spawner for QueueSpawner {
|
||||||
|
type Child = MockChild;
|
||||||
|
|
||||||
|
async fn spawn(&self, _cfg: &ServerConfig, _sink: LogSink) -> std::io::Result<MockChild> {
|
||||||
|
let mut q = self.queue.lock().unwrap();
|
||||||
|
Ok(q.remove(0))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn cfg(name: &str, policy: RestartPolicy, max_retries: u32) -> ServerConfig {
|
||||||
|
ServerConfig {
|
||||||
|
name: name.to_string(),
|
||||||
|
command: "/bin/true".into(),
|
||||||
|
args: vec![],
|
||||||
|
port: 1,
|
||||||
|
env: Default::default(),
|
||||||
|
working_dir: None,
|
||||||
|
restart: RestartConfig {
|
||||||
|
policy,
|
||||||
|
backoff_initial: Duration::from_millis(1),
|
||||||
|
backoff_max: Duration::from_millis(1),
|
||||||
|
max_retries_per_minute: max_retries,
|
||||||
|
},
|
||||||
|
stop: StopConfig {
|
||||||
|
grace: Duration::from_millis(50),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn sink(name: &str) -> LogSink {
|
||||||
|
let dir = tempdir().unwrap();
|
||||||
|
let writer = RotatingLogWriter::open(&dir.path().join("s.log"), 1024, 3).unwrap();
|
||||||
|
std::mem::forget(dir);
|
||||||
|
LogSink::new(name.to_string(), writer, 1024)
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn wait_for(rx: &mut watch::Receiver<ServerState>, want: ServerState) {
|
||||||
|
let deadline = tokio::time::Instant::now() + Duration::from_secs(2);
|
||||||
|
loop {
|
||||||
|
if *rx.borrow() == want {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
tokio::select! {
|
||||||
|
_ = rx.changed() => {}
|
||||||
|
_ = tokio::time::sleep_until(deadline) => panic!("never reached {want:?}, last={:?}", *rx.borrow()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn start_runs_to_running_and_stop_to_stopped() {
|
||||||
|
let (mock, mut ctl) = MockChild::new(1);
|
||||||
|
let queue = Arc::new(Mutex::new(vec![mock]));
|
||||||
|
let spawner = QueueSpawner { queue };
|
||||||
|
|
||||||
|
let (state_tx, mut state_rx) = watch::channel(ServerState::Stopped);
|
||||||
|
let (cmd_tx, cmd_rx) = mpsc::channel(8);
|
||||||
|
let task = SupervisorTask::new(
|
||||||
|
cfg("x", RestartPolicy::Never, 5),
|
||||||
|
sink("x"),
|
||||||
|
spawner,
|
||||||
|
state_tx,
|
||||||
|
cmd_rx,
|
||||||
|
);
|
||||||
|
let h = tokio::spawn(task.run());
|
||||||
|
|
||||||
|
let (ack_tx, ack_rx) = oneshot::channel();
|
||||||
|
cmd_tx
|
||||||
|
.send(SupervisorCmd::Start { ack: ack_tx })
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(ack_rx.await.unwrap(), StartAck::Started);
|
||||||
|
wait_for(&mut state_rx, ServerState::Running).await;
|
||||||
|
|
||||||
|
ctl.exit_tx.take().unwrap().send(Some(0)).unwrap();
|
||||||
|
wait_for(&mut state_rx, ServerState::Stopped).await;
|
||||||
|
|
||||||
|
let (ack_tx, ack_rx) = oneshot::channel();
|
||||||
|
cmd_tx
|
||||||
|
.send(SupervisorCmd::Shutdown { ack: ack_tx })
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
ack_rx.await.unwrap();
|
||||||
|
h.await.unwrap();
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user