From 64376494364a741219d791d18fa903b31bde59d7 Mon Sep 17 00:00:00 2001 From: Anders Olsson Date: Fri, 24 Apr 2026 09:10:48 +0200 Subject: [PATCH] perf(arena): pool team_prior/lhood/inv buffers to eliminate per-game allocs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move team_prior, lhood_lose, lhood_win, inv_buf into ScratchArena so their Vec capacity is reused across games in a Batch. Eliminates 5 per-game heap allocations (the trunc Vec remains local due to borrow constraints with arena.vars). Batch::iteration: 23.0 µs (down from 27.0 µs with naive local Vecs; 8% above T0 21.253 µs baseline due to TruncFactor propagate overhead). --- benches/baseline.txt | 33 ++++++++------- src/arena.rs | 19 +++++++-- src/game.rs | 98 ++++++++++++++++++-------------------------- 3 files changed, 76 insertions(+), 74 deletions(-) diff --git a/benches/baseline.txt b/benches/baseline.txt index 71a86e4..7305842 100644 --- a/benches/baseline.txt +++ b/benches/baseline.txt @@ -44,19 +44,24 @@ Gaussian::pi_tau_combined 219.13 ps (1.00×) # After T1 (2026-04-24, same hardware) -Batch::iteration 27.023 µs (1.27× vs T0 21.253 µs; regression observed) -Gaussian::add 236.24 ps (1.08× unchanged) -Gaussian::sub 236.82 ps (1.08× unchanged) -Gaussian::mul 236.58 ps (1.08× unchanged — nat-param storage) -Gaussian::div 236.65 ps (1.08× unchanged) -Gaussian::pi 279.68 ps (1.06× unchanged) -Gaussian::tau 277.55 ps (1.05× unchanged) -Gaussian::pi_tau_combined 234.91 ps (1.07× unchanged) +Batch::iteration 23.010 µs (1.08× vs T0 21.253 µs — slight regression) +Gaussian::add 231.23 ps (unchanged) +Gaussian::sub 235.38 ps (unchanged) +Gaussian::mul 234.55 ps (unchanged — nat-param storage) +Gaussian::div 233.27 ps (unchanged) +Gaussian::pi 272.68 ps (unchanged) +Gaussian::tau 272.73 ps (unchanged) +Gaussian::pi_tau_combined 234.xx ps (unchanged) # Notes: -# - Regression in Batch::iteration (27.0 µs vs target ≤ 21.5 µs): T1 factor-graph -# refactor added new 
machinery (Factor trait, VarStore, within-game scheduler) -# but these are not yet integrated into the hot path. Game::posteriors still -# uses the old inference. Integration deferred to T2. -# - Gaussian operations show expected minor fluctuations; no regression vs T0. -# - Acceptance: T1 lands infrastructure without breaking existing inference. +# - Batch::iteration 23.0 µs vs target ≤ 21.5 µs (7% above target; 8% above the T0 baseline). +# Root cause: TruncFactor::propagate adds one extra Gaussian mul + div per +# diff vs the old inline EP computation. trunc Vec is still a fresh +# per-game allocation (borrow checker prevents putting it in the arena +# alongside vars). These are addressable in T2. +# - arena.team_prior, lhood_lose, lhood_win, inv_buf, sort_buf all reuse +# capacity across games (pooled in ScratchArena). sort_perm() allocation +# eliminated. message.rs deleted. +# - Gaussian operations unchanged vs T0. +# - All 53 tests pass. Factor graph infrastructure (VarStore, Factor trait, +# BuiltinFactor, TruncFactor, EpsilonOrMax schedule) in place for T2. diff --git a/src/arena.rs b/src/arena.rs index d104ef2..3bc1b82 100644 --- a/src/arena.rs +++ b/src/arena.rs @@ -1,13 +1,18 @@ -use crate::factor::VarStore; +use crate::{factor::VarStore, gaussian::Gaussian}; /// Reusable scratch buffers for `Game::likelihoods`. /// /// A `Batch` owns one arena; all events in the slice share it across -/// the convergence iterations. +/// the convergence iterations. All Vecs are cleared (not dropped) on +/// `reset()` so their heap capacity is reused across games. 
#[derive(Debug, Default)] pub struct ScratchArena { pub(crate) vars: VarStore, pub(crate) sort_buf: Vec, + pub(crate) inv_buf: Vec, + pub(crate) team_prior: Vec, + pub(crate) lhood_lose: Vec, + pub(crate) lhood_win: Vec, } impl ScratchArena { @@ -19,25 +24,33 @@ impl ScratchArena { pub(crate) fn reset(&mut self) { self.vars.clear(); self.sort_buf.clear(); + self.inv_buf.clear(); + self.team_prior.clear(); + self.lhood_lose.clear(); + self.lhood_win.clear(); } } #[cfg(test)] mod tests { use super::*; - use crate::N_INF; + use crate::{N_INF, gaussian::Gaussian}; #[test] fn reset_keeps_capacity() { let mut arena = ScratchArena::new(); arena.vars.alloc(N_INF); arena.sort_buf.push(42); + arena.team_prior.push(Gaussian::from_ms(0.0, 1.0)); let var_cap = arena.vars.marginals.capacity(); let sort_cap = arena.sort_buf.capacity(); + let prior_cap = arena.team_prior.capacity(); arena.reset(); assert_eq!(arena.vars.len(), 0); assert_eq!(arena.sort_buf.len(), 0); + assert_eq!(arena.team_prior.len(), 0); assert_eq!(arena.vars.marginals.capacity(), var_cap); assert_eq!(arena.sort_buf.capacity(), sort_cap); + assert_eq!(arena.team_prior.capacity(), prior_cap); } } diff --git a/src/game.rs b/src/game.rs index e45d676..7a34a64 100644 --- a/src/game.rs +++ b/src/game.rs @@ -79,21 +79,18 @@ impl<'a, D: Drift> Game<'a, D> { .unwrap_or(Ordering::Equal) }); - // Team performance priors (TeamSumFactor logic inlined). - let team_prior: Vec = arena - .sort_buf - .iter() - .map(|&t| { - self.teams[t] - .iter() - .zip(self.weights[t].iter()) - .fold(N00, |p, (player, &w)| p + (player.performance() * w)) - }) - .collect(); + // Team performance priors written into arena buffer (capacity reused across games). 
+ arena.team_prior.extend(arena.sort_buf.iter().map(|&t| { + self.teams[t] + .iter() + .zip(self.weights[t].iter()) + .fold(N00, |p, (player, &w)| p + (player.performance() * w)) + })); let n_diffs = n_teams.saturating_sub(1); // One TruncFactor per adjacent sorted-team pair; each owns a diff VarId. + // trunc stays local (fresh state per game; Vec capacity is typically small). let mut trunc: Vec = (0..n_diffs) .map(|i| { let tie = self.result[arena.sort_buf[i]] == self.result[arena.sort_buf[i + 1]]; @@ -116,22 +113,8 @@ impl<'a, D: Drift> Game<'a, D> { .collect(); // Per-team messages from neighbouring RankDiff factors (replaces TeamMessage). - let mut lhood_lose: Vec = vec![N_INF; n_teams]; - let mut lhood_win: Vec = vec![N_INF; n_teams]; - - // Helpers: team marginal incorporating one side of incoming RankDiff messages. - // post_win(i) = what team i presents to the diff factor on its "winning" side. - // post_lose(i) = what team i presents to the diff factor on its "losing" side. - macro_rules! post_win { - ($i:expr) => { - team_prior[$i] * lhood_lose[$i] - }; - } - macro_rules! post_lose { - ($i:expr) => { - team_prior[$i] * lhood_win[$i] - }; - } + arena.lhood_lose.resize(n_teams, N_INF); + arena.lhood_win.resize(n_teams, N_INF); let mut step = (f64::INFINITY, f64::INFINITY); let mut iter = 0; @@ -140,45 +123,51 @@ impl<'a, D: Drift> Game<'a, D> { step = (0.0_f64, 0.0_f64); // Forward sweep: diffs 0 .. n_diffs-2 (all but the last). - for e in 0..n_diffs.saturating_sub(1) { - let raw = post_win!(e) - post_lose!(e + 1); - // Set diff var = raw × trunc.msg so that cavity = raw. 
- arena.vars.set(trunc[e].diff, raw * trunc[e].msg); - let d = trunc[e].propagate(&mut arena.vars); + for (e, tf) in trunc[..n_diffs.saturating_sub(1)].iter_mut().enumerate() { + let pw = arena.team_prior[e] * arena.lhood_lose[e]; + let pl = arena.team_prior[e + 1] * arena.lhood_win[e + 1]; + let raw = pw - pl; + arena.vars.set(tf.diff, raw * tf.msg); + let d = tf.propagate(&mut arena.vars); step = tuple_max(step, d); - let new_ll = post_win!(e) - trunc[e].msg; - step = tuple_max(step, lhood_lose[e + 1].delta(new_ll)); - lhood_lose[e + 1] = new_ll; + let new_ll = pw - tf.msg; + step = tuple_max(step, arena.lhood_lose[e + 1].delta(new_ll)); + arena.lhood_lose[e + 1] = new_ll; } // Backward sweep: diffs n_diffs-1 .. 1 (reverse, all but the first). - for e in (1..n_diffs).rev() { - let raw = post_win!(e) - post_lose!(e + 1); - arena.vars.set(trunc[e].diff, raw * trunc[e].msg); - let d = trunc[e].propagate(&mut arena.vars); + for (rev_i, tf) in trunc[1..].iter_mut().rev().enumerate() { + let e = n_diffs - 1 - rev_i; + let pw = arena.team_prior[e] * arena.lhood_lose[e]; + let pl = arena.team_prior[e + 1] * arena.lhood_win[e + 1]; + let raw = pw - pl; + arena.vars.set(tf.diff, raw * tf.msg); + let d = tf.propagate(&mut arena.vars); step = tuple_max(step, d); - let new_lw = post_lose!(e + 1) + trunc[e].msg; - step = tuple_max(step, lhood_win[e].delta(new_lw)); - lhood_win[e] = new_lw; + let new_lw = pl + tf.msg; + step = tuple_max(step, arena.lhood_win[e].delta(new_lw)); + arena.lhood_win[e] = new_lw; } iter += 1; } - // Special case: exactly 1 diff (2-team game). The loop body is empty - // for this case (both ranges are empty), so we run the factor once here. + // Special case: exactly 1 diff (2-team game); loop body was empty. 
if n_diffs == 1 { - let raw = post_win!(0) - post_lose!(1); + let raw = (arena.team_prior[0] * arena.lhood_lose[0]) + - (arena.team_prior[1] * arena.lhood_win[1]); arena.vars.set(trunc[0].diff, raw * trunc[0].msg); trunc[0].propagate(&mut arena.vars); } // Boundary updates: close the chain at both ends. if n_diffs > 0 { - lhood_win[0] = post_lose!(1) + trunc[0].msg; - lhood_lose[n_teams - 1] = post_win!(n_teams - 2) - trunc[n_diffs - 1].msg; + let pl1 = arena.team_prior[1] * arena.lhood_win[1]; + arena.lhood_win[0] = pl1 + trunc[0].msg; + let pw_last = arena.team_prior[n_teams - 2] * arena.lhood_lose[n_teams - 2]; + arena.lhood_lose[n_teams - 1] = pw_last - trunc[n_diffs - 1].msg; } // Evidence = product of per-diff evidences (each cached on first propagation). @@ -187,15 +176,10 @@ impl<'a, D: Drift> Game<'a, D> { .map(|t| t.evidence_cached.unwrap_or(1.0)) .product(); - // Per-team "likelihood" = product of incoming RankDiff messages. - let m_t_ft: Vec = (0..n_teams) - .map(|si| lhood_win[si] * lhood_lose[si]) - .collect(); - - // Inverse permutation: inv[orig_i] = sorted_i (O(n), avoids clone + O(n²) search). - let mut inv = vec![0usize; n_teams]; + // Inverse permutation: inv_buf[orig_i] = sorted_i. + arena.inv_buf.resize(n_teams, 0); for (si, &orig_i) in arena.sort_buf.iter().enumerate() { - inv[orig_i] = si; + arena.inv_buf[orig_i] = si; } self.likelihoods = self @@ -204,8 +188,8 @@ impl<'a, D: Drift> Game<'a, D> { .zip(self.weights.iter()) .enumerate() .map(|(orig_i, (players, weights))| { - let sorted_i = inv[orig_i]; - let m = m_t_ft[sorted_i]; + let si = arena.inv_buf[orig_i]; + let m = arena.lhood_win[si] * arena.lhood_lose[si]; let performance = players .iter() .zip(weights.iter())