From 64376494364a741219d791d18fa903b31bde59d7 Mon Sep 17 00:00:00 2001
From: Anders Olsson <anders.e.olsson@gmail.com>
Date: Fri, 24 Apr 2026 09:10:48 +0200
Subject: [PATCH] perf(arena): pool team_prior/lhood/inv buffers to eliminate
 per-game allocs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

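Move team_prior, lhood_lose, lhood_win, and inv_buf into ScratchArena so
their Vec capacity is reused across games in a Batch, and compute the
per-team likelihood product inline instead of collecting it into the
temporary m_t_ft Vec. Together this eliminates 5 per-game heap
allocations (the trunc Vec remains local due to borrow constraints with
arena.vars).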

Batch::iteration: 23.0 µs (down from 27.0 µs with naive local Vecs;
8% above T0 21.253 µs baseline due to TruncFactor propagate overhead).
---
 benches/baseline.txt | 33 ++++++++-------
 src/arena.rs         | 19 +++++++--
 src/game.rs          | 98 ++++++++++++++++++--------------------------
 3 files changed, 76 insertions(+), 74 deletions(-)
diff --git a/benches/baseline.txt b/benches/baseline.txt
index 71a86e4..7305842 100644
--- a/benches/baseline.txt
+++ b/benches/baseline.txt
@@ -44,19 +44,24 @@ Gaussian::pi_tau_combined 219.13 ps    (1.00×)
 
 # After T1 (2026-04-24, same hardware)
 
-Batch::iteration           27.023 µs   (1.27× vs T0 21.253 µs; regression observed)
-Gaussian::add             236.24 ps    (1.08× unchanged)
-Gaussian::sub             236.82 ps    (1.08× unchanged)
-Gaussian::mul             236.58 ps    (1.08× unchanged — nat-param storage)
-Gaussian::div             236.65 ps    (1.08× unchanged)
-Gaussian::pi              279.68 ps    (1.06× unchanged)
-Gaussian::tau             277.55 ps    (1.05× unchanged)
-Gaussian::pi_tau_combined 234.91 ps    (1.07× unchanged)
+Batch::iteration           23.010 µs   (1.08× vs T0 21.253 µs — slight regression)
+Gaussian::add             231.23 ps    (unchanged)
+Gaussian::sub             235.38 ps    (unchanged)
+Gaussian::mul             234.55 ps    (unchanged — nat-param storage)
+Gaussian::div             233.27 ps    (unchanged)
+Gaussian::pi              272.68 ps    (unchanged)
+Gaussian::tau             272.73 ps    (unchanged)
+Gaussian::pi_tau_combined ~234 ps       (unchanged)
 
 # Notes:
-# - Regression in Batch::iteration (27.0 µs vs target ≤ 21.5 µs): T1 factor-graph
-#   refactor added new machinery (Factor trait, VarStore, within-game scheduler)
-#   but these are not yet integrated into the hot path. Game::posteriors still
-#   uses the old inference. Integration deferred to T2.
-# - Gaussian operations show expected minor fluctuations; no regression vs T0.
-# - Acceptance: T1 lands infrastructure without breaking existing inference.
+# - Batch::iteration 23.0 µs vs target ≤ 21.5 µs (7% above target, 8% above T0).
+#   Root cause: TruncFactor::propagate adds one extra Gaussian mul + div per
+#   diff vs the old inline EP computation. trunc Vec is still a fresh
+#   per-game allocation (borrow checker prevents putting it in the arena
+#   alongside vars). These are addressable in T2.
+# - arena.team_prior, lhood_lose, lhood_win, inv_buf, sort_buf all reuse
+#   capacity across games (pooled in ScratchArena). sort_perm() allocation
+#   eliminated. message.rs deleted.
+# - Gaussian operations unchanged vs T0.
+# - All 53 tests pass. Factor-graph infrastructure (VarStore, Factor trait,
+#   BuiltinFactor, TruncFactor, EpsilonOrMax schedule) is in place for T2.
diff --git a/src/arena.rs b/src/arena.rs
index d104ef2..3bc1b82 100644
--- a/src/arena.rs
+++ b/src/arena.rs
@@ -1,13 +1,18 @@
-use crate::factor::VarStore;
+use crate::{factor::VarStore, gaussian::Gaussian};
 
 /// Reusable scratch buffers for `Game::likelihoods`.
 ///
 /// A `Batch` owns one arena; all events in the slice share it across
-/// the convergence iterations.
+/// the convergence iterations. All Vecs are cleared (not dropped) on
+/// `reset()` so their heap capacity is reused across games.
 #[derive(Debug, Default)]
 pub struct ScratchArena {
     pub(crate) vars: VarStore,
     pub(crate) sort_buf: Vec<usize>,
+    pub(crate) inv_buf: Vec<usize>,
+    pub(crate) team_prior: Vec<Gaussian>,
+    pub(crate) lhood_lose: Vec<Gaussian>,
+    pub(crate) lhood_win: Vec<Gaussian>,
 }
 
 impl ScratchArena {
@@ -19,25 +24,33 @@ impl ScratchArena {
     pub(crate) fn reset(&mut self) {
         self.vars.clear();
         self.sort_buf.clear();
+        self.inv_buf.clear();
+        self.team_prior.clear();
+        self.lhood_lose.clear();
+        self.lhood_win.clear();
     }
 }
 
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::N_INF;
+    use crate::{N_INF, gaussian::Gaussian};
 
     #[test]
     fn reset_keeps_capacity() {
         let mut arena = ScratchArena::new();
         arena.vars.alloc(N_INF);
         arena.sort_buf.push(42);
+        arena.team_prior.push(Gaussian::from_ms(0.0, 1.0));
         let var_cap = arena.vars.marginals.capacity();
         let sort_cap = arena.sort_buf.capacity();
+        let prior_cap = arena.team_prior.capacity();
         arena.reset();
         assert_eq!(arena.vars.len(), 0);
         assert_eq!(arena.sort_buf.len(), 0);
+        assert_eq!(arena.team_prior.len(), 0);
         assert_eq!(arena.vars.marginals.capacity(), var_cap);
         assert_eq!(arena.sort_buf.capacity(), sort_cap);
+        assert_eq!(arena.team_prior.capacity(), prior_cap);
     }
 }
diff --git a/src/game.rs b/src/game.rs
index e45d676..7a34a64 100644
--- a/src/game.rs
+++ b/src/game.rs
@@ -79,21 +79,18 @@ impl<'a, D: Drift> Game<'a, D> {
                 .unwrap_or(Ordering::Equal)
         });
 
-        // Team performance priors (TeamSumFactor logic inlined).
-        let team_prior: Vec<Gaussian> = arena
-            .sort_buf
-            .iter()
-            .map(|&t| {
-                self.teams[t]
-                    .iter()
-                    .zip(self.weights[t].iter())
-                    .fold(N00, |p, (player, &w)| p + (player.performance() * w))
-            })
-            .collect();
+        // Team performance priors written into arena buffer (capacity reused across games).
+        arena.team_prior.extend(arena.sort_buf.iter().map(|&t| {
+            self.teams[t]
+                .iter()
+                .zip(self.weights[t].iter())
+                .fold(N00, |p, (player, &w)| p + (player.performance() * w))
+        }));
 
         let n_diffs = n_teams.saturating_sub(1);
 
         // One TruncFactor per adjacent sorted-team pair; each owns a diff VarId.
+        // trunc stays local (fresh state per game; Vec capacity is typically small).
         let mut trunc: Vec<TruncFactor> = (0..n_diffs)
             .map(|i| {
                 let tie = self.result[arena.sort_buf[i]] == self.result[arena.sort_buf[i + 1]];
@@ -116,22 +113,8 @@ impl<'a, D: Drift> Game<'a, D> {
             .collect();
 
         // Per-team messages from neighbouring RankDiff factors (replaces TeamMessage).
-        let mut lhood_lose: Vec<Gaussian> = vec![N_INF; n_teams];
-        let mut lhood_win: Vec<Gaussian> = vec![N_INF; n_teams];
-
-        // Helpers: team marginal incorporating one side of incoming RankDiff messages.
-        // post_win(i) = what team i presents to the diff factor on its "winning" side.
-        // post_lose(i) = what team i presents to the diff factor on its "losing" side.
-        macro_rules! post_win {
-            ($i:expr) => {
-                team_prior[$i] * lhood_lose[$i]
-            };
-        }
-        macro_rules! post_lose {
-            ($i:expr) => {
-                team_prior[$i] * lhood_win[$i]
-            };
-        }
+        arena.lhood_lose.resize(n_teams, N_INF);
+        arena.lhood_win.resize(n_teams, N_INF);
 
         let mut step = (f64::INFINITY, f64::INFINITY);
         let mut iter = 0;
@@ -140,45 +123,51 @@ impl<'a, D: Drift> Game<'a, D> {
             step = (0.0_f64, 0.0_f64);
 
             // Forward sweep: diffs 0 .. n_diffs-2 (all but the last).
-            for e in 0..n_diffs.saturating_sub(1) {
-                let raw = post_win!(e) - post_lose!(e + 1);
-                // Set diff var = raw × trunc.msg so that cavity = raw.
-                arena.vars.set(trunc[e].diff, raw * trunc[e].msg);
-                let d = trunc[e].propagate(&mut arena.vars);
+            for (e, tf) in trunc[..n_diffs.saturating_sub(1)].iter_mut().enumerate() {
+                let pw = arena.team_prior[e] * arena.lhood_lose[e];
+                let pl = arena.team_prior[e + 1] * arena.lhood_win[e + 1];
+                let raw = pw - pl;
+                arena.vars.set(tf.diff, raw * tf.msg);
+                let d = tf.propagate(&mut arena.vars);
                 step = tuple_max(step, d);
 
-                let new_ll = post_win!(e) - trunc[e].msg;
-                step = tuple_max(step, lhood_lose[e + 1].delta(new_ll));
-                lhood_lose[e + 1] = new_ll;
+                let new_ll = pw - tf.msg;
+                step = tuple_max(step, arena.lhood_lose[e + 1].delta(new_ll));
+                arena.lhood_lose[e + 1] = new_ll;
             }
 
             // Backward sweep: diffs n_diffs-1 .. 1 (reverse, all but the first).
-            for e in (1..n_diffs).rev() {
-                let raw = post_win!(e) - post_lose!(e + 1);
-                arena.vars.set(trunc[e].diff, raw * trunc[e].msg);
-                let d = trunc[e].propagate(&mut arena.vars);
+            for (rev_i, tf) in trunc.iter_mut().skip(1).rev().enumerate() { // ok if trunc is empty
+                let e = n_diffs - 1 - rev_i;
+                let pw = arena.team_prior[e] * arena.lhood_lose[e];
+                let pl = arena.team_prior[e + 1] * arena.lhood_win[e + 1];
+                let raw = pw - pl;
+                arena.vars.set(tf.diff, raw * tf.msg);
+                let d = tf.propagate(&mut arena.vars);
                 step = tuple_max(step, d);
 
-                let new_lw = post_lose!(e + 1) + trunc[e].msg;
-                step = tuple_max(step, lhood_win[e].delta(new_lw));
-                lhood_win[e] = new_lw;
+                let new_lw = pl + tf.msg;
+                step = tuple_max(step, arena.lhood_win[e].delta(new_lw));
+                arena.lhood_win[e] = new_lw;
             }
 
             iter += 1;
         }
 
-        // Special case: exactly 1 diff (2-team game).  The loop body is empty
-        // for this case (both ranges are empty), so we run the factor once here.
+        // Special case: exactly 1 diff (2-team game); loop body was empty.
         if n_diffs == 1 {
-            let raw = post_win!(0) - post_lose!(1);
+            let raw = (arena.team_prior[0] * arena.lhood_lose[0])
+                - (arena.team_prior[1] * arena.lhood_win[1]);
             arena.vars.set(trunc[0].diff, raw * trunc[0].msg);
             trunc[0].propagate(&mut arena.vars);
         }
 
         // Boundary updates: close the chain at both ends.
         if n_diffs > 0 {
-            lhood_win[0] = post_lose!(1) + trunc[0].msg;
-            lhood_lose[n_teams - 1] = post_win!(n_teams - 2) - trunc[n_diffs - 1].msg;
+            let pl1 = arena.team_prior[1] * arena.lhood_win[1];
+            arena.lhood_win[0] = pl1 + trunc[0].msg;
+            let pw_last = arena.team_prior[n_teams - 2] * arena.lhood_lose[n_teams - 2];
+            arena.lhood_lose[n_teams - 1] = pw_last - trunc[n_diffs - 1].msg;
         }
 
         // Evidence = product of per-diff evidences (each cached on first propagation).
@@ -187,15 +176,10 @@ impl<'a, D: Drift> Game<'a, D> {
             .map(|t| t.evidence_cached.unwrap_or(1.0))
             .product();
 
-        // Per-team "likelihood" = product of incoming RankDiff messages.
-        let m_t_ft: Vec<Gaussian> = (0..n_teams)
-            .map(|si| lhood_win[si] * lhood_lose[si])
-            .collect();
-
-        // Inverse permutation: inv[orig_i] = sorted_i (O(n), avoids clone + O(n²) search).
-        let mut inv = vec![0usize; n_teams];
+        // Inverse permutation: inv_buf[orig_i] = sorted_i.
+        arena.inv_buf.resize(n_teams, 0);
         for (si, &orig_i) in arena.sort_buf.iter().enumerate() {
-            inv[orig_i] = si;
+            arena.inv_buf[orig_i] = si;
         }
 
         self.likelihoods = self
@@ -204,8 +188,8 @@ impl<'a, D: Drift> Game<'a, D> {
             .zip(self.weights.iter())
             .enumerate()
             .map(|(orig_i, (players, weights))| {
-                let sorted_i = inv[orig_i];
-                let m = m_t_ft[sorted_i];
+                let si = arena.inv_buf[orig_i];
+                let m = arena.lhood_win[si] * arena.lhood_lose[si];
                 let performance = players
                     .iter()
                     .zip(weights.iter())