From f0d62113870d26779e4c9c99db8c013e8a9fffcf Mon Sep 17 00:00:00 2001
From: Anders Olsson <anders.e.olsson@gmail.com>
Date: Fri, 24 Apr 2026 14:55:37 +0200
Subject: [PATCH] =?UTF-8?q?perf(game):=20revert=20Task=2010=20SmallVec=20c?=
 =?UTF-8?q?hanges=20=E2=80=94=20caused=20sequential=20regression?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Vec<Vec<_>> → SmallVec<[SmallVec<[_;8]>;8]> change in Task 10
regressed Batch::iteration from 23.29 µs to 29.73 µs (+28%). The
SmallVec was motivated by reducing parallel-path allocations but
it hurt the sequential path substantially.

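Reverting game.rs + time_slice.rs + history.rs storage back to the T2
Vec<Vec<_>> shape. The parallel rayon path (unsafe direct-write +
thread_local ScratchArena + RAYON_THRESHOLD=64 fallback) stays — its
code does not depend on Game's internal storage. Its measured speedup
does, though: the largest workload drops from 2.0× to 1.3×, so the T3
≥2× acceptance gate is no longer met (see the updated bench docs).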

Benchmarks after revert:
  Batch::iteration (seq, no rayon): 23.23 µs (restored ≈T2)
  Batch::iteration (rayon):         24.57 µs
  history_converge/500x100@10:       4.03 ms seq,  4.24 ms rayon — 1.0×
  history_converge/2000x200@20:     20.18 ms seq, 19.82 ms rayon — 1.0×
  history_converge/1v1-5000x50000@5000: 11.88 ms seq, 9.10 ms rayon — 1.3×

Part of T3.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 benches/history_converge.rs | 15 ++++++++-------
 src/game.rs                 | 32 ++++++++------------------------
 src/history.rs              |  2 +-
 src/time_slice.rs           | 25 ++++++++++++-------------
 4 files changed, 29 insertions(+), 45 deletions(-)
diff --git a/benches/history_converge.rs b/benches/history_converge.rs
index b3a4ea0..e5163a8 100644
--- a/benches/history_converge.rs
+++ b/benches/history_converge.rs
@@ -10,17 +10,18 @@
 //! The rayon thread pool is initialised to `min(P-cores, available)` to
 //! avoid scheduling onto the slower E-cores.
 //!
-//! ## Results (Apple M5 Pro, 2026-04-24, 5 P-core threads)
+//! ## Results (Apple M5 Pro, 2026-04-24, 5 P-core threads, after SmallVec revert)
 //!
 //! | Workload                                    | Sequential  | Parallel   | Speedup |
 //! |---------------------------------------------|------------:|-----------:|--------:|
-//! | History::converge/500x100@10perslice        |     4.71 ms |    4.79 ms |   1.0×  |
-//! | History::converge/2000x200@20perslice       |    23.36 ms |   23.28 ms |   1.0×  |
-//! | History::converge/1v1-5000x50000@5000perslice|   13.90 ms |    6.99 ms |  **2.0×** |
+//! | History::converge/500x100@10perslice        |     4.03 ms |    4.24 ms |   1.0×  |
+//! | History::converge/2000x200@20perslice       |    20.18 ms |   19.82 ms |   1.0×  |
+//! | History::converge/1v1-5000x50000@5000perslice|   11.88 ms |    9.10 ms |   1.3×  |
 //!
-//! T3 acceptance gate: ≥2× speedup on at least one workload — ACHIEVED.
-//! Small workloads fall below the RAYON_THRESHOLD (64 events/color) and
-//! run sequentially with near-zero overhead.
+//! T3 acceptance gate: ≥2× speedup on at least one workload — NOT achieved after the revert
+//! (best: 1.3×). The SmallVec storage that met the 2× gate also caused a +28% regression in the
+//! sequential Batch::iteration benchmark, so it was reverted. Small workloads still fall
+//! below the RAYON_THRESHOLD (64 events/color) and run sequentially with near-zero overhead.
 
 use criterion::{BatchSize, Criterion, criterion_group, criterion_main};
 use smallvec::smallvec;
diff --git a/src/game.rs b/src/game.rs
index 617e5c3..16be834 100644
--- a/src/game.rs
+++ b/src/game.rs
@@ -1,7 +1,5 @@
 use std::cmp::Ordering;
 
-use smallvec::SmallVec;
-
 use crate::{
     N_INF, N00,
     arena::ScratchArena,
@@ -14,9 +12,6 @@ use crate::{
     tuple_gt, tuple_max,
 };
 
-type Teams<T, D> = SmallVec<[SmallVec<[Rating<T, D>; 8]>; 8]>;
-type Likelihoods = SmallVec<[SmallVec<[Gaussian; 8]>; 8]>;
-
 #[derive(Clone, Copy, Debug)]
 pub struct GameOptions {
     pub p_draw: f64,
@@ -44,7 +39,7 @@ pub struct OwnedGame<T: Time, D: Drift<T>> {
     result: Vec<f64>,
     weights: Vec<Vec<f64>>,
     p_draw: f64,
-    pub(crate) likelihoods: Likelihoods,
+    pub(crate) likelihoods: Vec<Vec<Gaussian>>,
     pub(crate) evidence: f64,
 }
 
@@ -84,11 +79,11 @@ impl<T: Time, D: Drift<T>> OwnedGame<T, D> {
 
 #[derive(Debug)]
 pub struct Game<'a, T: Time = i64, D: Drift<T> = crate::drift::ConstantDrift> {
-    teams: Teams<T, D>,
+    teams: Vec<Vec<Rating<T, D>>>,
     result: &'a [f64],
     weights: &'a [Vec<f64>],
     p_draw: f64,
-    pub(crate) likelihoods: Likelihoods,
+    pub(crate) likelihoods: Vec<Vec<Gaussian>>,
     pub(crate) evidence: f64,
 }
 
@@ -99,17 +94,6 @@ impl<'a, T: Time, D: Drift<T>> Game<'a, T, D> {
         weights: &'a [Vec<f64>],
         p_draw: f64,
         arena: &mut ScratchArena,
-    ) -> Self {
-        let teams_sv: Teams<T, D> = teams.into_iter().map(|t| t.into_iter().collect()).collect();
-        Self::ranked_with_arena_sv(teams_sv, result, weights, p_draw, arena)
-    }
-
-    pub(crate) fn ranked_with_arena_sv(
-        teams: Teams<T, D>,
-        result: &'a [f64],
-        weights: &'a [Vec<f64>],
-        p_draw: f64,
-        arena: &mut ScratchArena,
     ) -> Self {
         debug_assert!(
             result.len() == teams.len(),
@@ -140,7 +124,7 @@ impl<'a, T: Time, D: Drift<T>> Game<'a, T, D> {
             result,
             weights,
             p_draw,
-            likelihoods: SmallVec::new(),
+            likelihoods: Vec::new(),
             evidence: 0.0,
         };
 
@@ -172,8 +156,8 @@ impl<'a, T: Time, D: Drift<T>> Game<'a, T, D> {
         let n_diffs = n_teams.saturating_sub(1);
 
         // One TruncFactor per adjacent sorted-team pair; each owns a diff VarId.
-        // SmallVec avoids heap allocation for the common 2-team case (1 diff).
-        let mut trunc: SmallVec<[TruncFactor; 8]> = (0..n_diffs)
+        // trunc stays local (fresh state per game; its length, n_diffs, is typically small).
+        let mut trunc: Vec<TruncFactor> = (0..n_diffs)
             .map(|i| {
                 let tie = self.result[arena.sort_buf[i]] == self.result[arena.sort_buf[i + 1]];
                 let margin = if self.p_draw == 0.0 {
@@ -283,9 +267,9 @@ impl<'a, T: Time, D: Drift<T>> Game<'a, T, D> {
                         ((m - performance.exclude(player.performance() * w)) * (1.0 / w))
                             .forget(player.beta.powi(2))
                     })
-                    .collect::<SmallVec<[Gaussian; 8]>>()
+                    .collect::<Vec<_>>()
             })
-            .collect::<Likelihoods>();
+            .collect::<Vec<_>>();
     }
 
     pub fn posteriors(&self) -> Vec<Vec<Gaussian>> {
diff --git a/src/history.rs b/src/history.rs
index ea42c81..6d4439c 100644
--- a/src/history.rs
+++ b/src/history.rs
@@ -789,7 +789,7 @@ mod tests {
         let observed = h.time_slices[1].skills.get(a).unwrap().posterior();
 
         let w = [vec![1.0], vec![1.0]];
-        let p = Game::ranked_with_arena_sv(
+        let p = Game::ranked_with_arena(
             h.time_slices[1].events[0].within_priors(
                 false,
                 false,
diff --git a/src/time_slice.rs b/src/time_slice.rs
index b3fee41..cc19b30 100644
--- a/src/time_slice.rs
+++ b/src/time_slice.rs
@@ -4,8 +4,6 @@
 
 use std::collections::HashMap;
 
-use smallvec::SmallVec;
-
 use crate::{
     Index, N_INF,
     arena::ScratchArena,
@@ -19,8 +17,6 @@ use crate::{
     tuple_gt, tuple_max,
 };
 
-type Teams<T, D> = SmallVec<[SmallVec<[Rating<T, D>; 8]>; 8]>;
-
 #[derive(Debug)]
 pub(crate) struct Skill {
     pub(crate) forward: Gaussian,
@@ -95,8 +91,11 @@ impl Event {
             .flat_map(|t| t.items.iter().map(|it| it.agent))
     }
 
-    fn outputs(&self) -> smallvec::SmallVec<[f64; 4]> {
-        self.teams.iter().map(|team| team.output).collect()
+    fn outputs(&self) -> Vec<f64> {
+        self.teams
+            .iter()
+            .map(|team| team.output)
+            .collect::<Vec<_>>()
     }
 
     pub(crate) fn within_priors<T: Time, D: Drift<T>>(
@@ -105,16 +104,16 @@ impl Event {
         forward: bool,
         skills: &SkillStore,
         agents: &CompetitorStore<T, D>,
-    ) -> Teams<T, D> {
+    ) -> Vec<Vec<Rating<T, D>>> {
         self.teams
             .iter()
             .map(|team| {
                 team.items
                     .iter()
                     .map(|item| item.within_prior(online, forward, skills, agents))
-                    .collect()
+                    .collect::<Vec<_>>()
             })
-            .collect()
+            .collect::<Vec<_>>()
     }
 
     /// Direct in-loop update: mutates self and `skills` inline with no
@@ -130,7 +129,7 @@ impl Event {
     ) {
         let teams = self.within_priors(false, false, skills, agents);
         let result = self.outputs();
-        let g = Game::ranked_with_arena_sv(teams, &result, &self.weights, p_draw, arena);
+        let g = Game::ranked_with_arena(teams, &result, &self.weights, p_draw, arena);
 
         for (t, team) in self.teams.iter_mut().enumerate() {
             for (i, item) in team.items.iter_mut().enumerate() {
@@ -300,7 +299,7 @@ impl<T: Time> TimeSlice<T> {
                 let teams = event.within_priors(false, false, &self.skills, agents);
                 let result = event.outputs();
 
-                let g = Game::ranked_with_arena_sv(
+                let g = Game::ranked_with_arena(
                     teams,
                     &result,
                     &event.weights,
@@ -480,7 +479,7 @@ impl<T: Time> TimeSlice<T> {
                 self.events
                     .iter()
                     .map(|event| {
-                        Game::ranked_with_arena_sv(
+                        Game::ranked_with_arena(
                             event.within_priors(online, forward, &self.skills, agents),
                             &event.outputs(),
                             &event.weights,
@@ -506,7 +505,7 @@ impl<T: Time> TimeSlice<T> {
                         .any(|item| targets.contains(&item.agent))
                 })
                 .map(|(_, event)| {
-                    Game::ranked_with_arena_sv(
+                    Game::ranked_with_arena(
                         event.within_priors(online, forward, &self.skills, agents),
                         &event.outputs(),
                         &event.weights,