perf(game): revert Task 10 SmallVec changes — caused sequential regression

The Vec<Vec<_>> → SmallVec<[SmallVec<[_;8]>;8]> change in Task 10 regressed Batch::iteration from 23.29 µs to 29.73 µs (+28%). The SmallVec change was motivated by reducing parallel-path allocations, but it hurt the sequential path substantially. This commit reverts the game.rs + time_slice.rs + history.rs storage back to the T2 Vec<Vec<_>> shape. The parallel rayon path (unsafe direct-write + thread_local ScratchArena + RAYON_THRESHOLD=64 fallback) stays — it is independent of Game's internal storage.

Trade-off: the 1v1 parallel speedup drops from 2.0× to 1.3×, so the T3 ≥2× acceptance gate is no longer met after this revert.

Benchmarks after revert:
- Batch::iteration (seq, no rayon): 23.23 µs (restored ≈T2)
- Batch::iteration (rayon): 24.57 µs
- history_converge/500x100@10: 4.03 ms seq, 4.24 ms rayon — 1.0×
- history_converge/2000x200@20: 20.18 ms seq, 19.82 ms rayon — 1.0×
- history_converge/1v1-5000x50000@5000: 11.88 ms seq, 9.10 ms rayon — 1.3×

Part of T3.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-24 14:55:37 +02:00
parent be515c3d8d
commit f0d6211387
4 changed files with 29 additions and 45 deletions
@@ -10,17 +10,18 @@
 //! The rayon thread pool is initialised to `min(P-cores, available)` to
 //! avoid scheduling onto the slower E-cores.
 //!
-//! ## Results (Apple M5 Pro, 2026-04-24, 5 P-core threads)
+//! ## Results (Apple M5 Pro, 2026-04-24, after SmallVec revert)
 //!
 //! | Workload                                    | Sequential  | Parallel   | Speedup |
 //! |---------------------------------------------|------------:|-----------:|--------:|
-//! | History::converge/500x100@10perslice        |     4.71 ms |    4.79 ms |   1.0×  |
+//! | History::converge/500x100@10perslice        |     4.03 ms |    4.24 ms |   1.0×  |
-//! | History::converge/2000x200@20perslice       |    23.36 ms |   23.28 ms |   1.0×  |
+//! | History::converge/2000x200@20perslice       |    20.18 ms |   19.82 ms |   1.0×  |
-//! | History::converge/1v1-5000x50000@5000perslice|   13.90 ms |    6.99 ms |  **2.0×** |
+//! | History::converge/1v1-5000x50000@5000perslice|   11.88 ms |    9.10 ms |   1.3×  |
 //!
-//! T3 acceptance gate: ≥2× speedup on at least one workload — ACHIEVED.
+//! T3 acceptance gate: ≥2× speedup on at least one workload — NOT achieved after revert.
-//! Small workloads fall below the RAYON_THRESHOLD (64 events/color) and
+//! The SmallVec storage that enabled the 2× gate caused a +28% regression in the
-//! run sequentially with near-zero overhead.
+//! sequential Batch::iteration benchmark and was reverted. Small workloads still fall
+//! below the RAYON_THRESHOLD (64 events/color) and run sequentially with near-zero overhead.
 use criterion::{BatchSize, Criterion, criterion_group, criterion_main};
 use smallvec::smallvec;
@@ -1,7 +1,5 @@
 use std::cmp::Ordering;
-use smallvec::SmallVec;
 use crate::{
    N_INF, N00,
    arena::ScratchArena,
@@ -14,9 +12,6 @@ use crate::{
    tuple_gt, tuple_max,
 };
-type Teams<T, D> = SmallVec<[SmallVec<[Rating<T, D>; 8]>; 8]>;
-type Likelihoods = SmallVec<[SmallVec<[Gaussian; 8]>; 8]>;
 #[derive(Clone, Copy, Debug)]
 pub struct GameOptions {
    pub p_draw: f64,
@@ -44,7 +39,7 @@ pub struct OwnedGame<T: Time, D: Drift<T>> {
    result: Vec<f64>,
    weights: Vec<Vec<f64>>,
    p_draw: f64,
-    pub(crate) likelihoods: Likelihoods,
+    pub(crate) likelihoods: Vec<Vec<Gaussian>>,
    pub(crate) evidence: f64,
 }
@@ -84,11 +79,11 @@ impl<T: Time, D: Drift<T>> OwnedGame<T, D> {
 #[derive(Debug)]
 pub struct Game<'a, T: Time = i64, D: Drift<T> = crate::drift::ConstantDrift> {
-    teams: Teams<T, D>,
+    teams: Vec<Vec<Rating<T, D>>>,
    result: &'a [f64],
    weights: &'a [Vec<f64>],
    p_draw: f64,
-    pub(crate) likelihoods: Likelihoods,
+    pub(crate) likelihoods: Vec<Vec<Gaussian>>,
    pub(crate) evidence: f64,
 }
@@ -99,17 +94,6 @@ impl<'a, T: Time, D: Drift<T>> Game<'a, T, D> {
        weights: &'a [Vec<f64>],
        p_draw: f64,
        arena: &mut ScratchArena,
    ) -> Self {
        let teams_sv: Teams<T, D> = teams.into_iter().map(|t| t.into_iter().collect()).collect();
        Self::ranked_with_arena_sv(teams_sv, result, weights, p_draw, arena)
    }
    pub(crate) fn ranked_with_arena_sv(
        teams: Teams<T, D>,
        result: &'a [f64],
        weights: &'a [Vec<f64>],
        p_draw: f64,
        arena: &mut ScratchArena,
    ) -> Self {
        debug_assert!(
            result.len() == teams.len(),
@@ -140,7 +124,7 @@ impl<'a, T: Time, D: Drift<T>> Game<'a, T, D> {
            result,
            weights,
            p_draw,
-            likelihoods: SmallVec::new(),
+            likelihoods: Vec::new(),
            evidence: 0.0,
        };
@@ -172,8 +156,8 @@ impl<'a, T: Time, D: Drift<T>> Game<'a, T, D> {
        let n_diffs = n_teams.saturating_sub(1);
        // One TruncFactor per adjacent sorted-team pair; each owns a diff VarId.
-        // SmallVec avoids heap allocation for the common 2-team case (1 diff).
+        // trunc stays local (fresh state per game; Vec capacity is typically small).
-        let mut trunc: SmallVec<[TruncFactor; 8]> = (0..n_diffs)
+        let mut trunc: Vec<TruncFactor> = (0..n_diffs)
            .map(|i| {
                let tie = self.result[arena.sort_buf[i]] == self.result[arena.sort_buf[i + 1]];
                let margin = if self.p_draw == 0.0 {
@@ -283,9 +267,9 @@ impl<'a, T: Time, D: Drift<T>> Game<'a, T, D> {
                        ((m - performance.exclude(player.performance() * w)) * (1.0 / w))
                            .forget(player.beta.powi(2))
                    })
-                    .collect::<SmallVec<[Gaussian; 8]>>()
+                    .collect::<Vec<_>>()
            })
-            .collect::<Likelihoods>();
+            .collect::<Vec<_>>();
    }
    pub fn posteriors(&self) -> Vec<Vec<Gaussian>> {
@@ -789,7 +789,7 @@ mod tests {
        let observed = h.time_slices[1].skills.get(a).unwrap().posterior();
        let w = [vec![1.0], vec![1.0]];
-        let p = Game::ranked_with_arena_sv(
+        let p = Game::ranked_with_arena(
            h.time_slices[1].events[0].within_priors(
                false,
                false,
@@ -4,8 +4,6 @@
 use std::collections::HashMap;
-use smallvec::SmallVec;
 use crate::{
    Index, N_INF,
    arena::ScratchArena,
@@ -19,8 +17,6 @@ use crate::{
    tuple_gt, tuple_max,
 };
-type Teams<T, D> = SmallVec<[SmallVec<[Rating<T, D>; 8]>; 8]>;
 #[derive(Debug)]
 pub(crate) struct Skill {
    pub(crate) forward: Gaussian,
@@ -95,8 +91,11 @@ impl Event {
            .flat_map(|t| t.items.iter().map(|it| it.agent))
    }
-    fn outputs(&self) -> smallvec::SmallVec<[f64; 4]> {
+    fn outputs(&self) -> Vec<f64> {
-        self.teams.iter().map(|team| team.output).collect()
+        self.teams
+            .iter()
+            .map(|team| team.output)
+            .collect::<Vec<_>>()
    }
    pub(crate) fn within_priors<T: Time, D: Drift<T>>(
@@ -105,16 +104,16 @@ impl Event {
        forward: bool,
        skills: &SkillStore,
        agents: &CompetitorStore<T, D>,
-    ) -> Teams<T, D> {
+    ) -> Vec<Vec<Rating<T, D>>> {
        self.teams
            .iter()
            .map(|team| {
                team.items
                    .iter()
                    .map(|item| item.within_prior(online, forward, skills, agents))
-                    .collect()
+                    .collect::<Vec<_>>()
            })
-            .collect()
+            .collect::<Vec<_>>()
    }
    /// Direct in-loop update: mutates self and `skills` inline with no
@@ -130,7 +129,7 @@ impl Event {
    ) {
        let teams = self.within_priors(false, false, skills, agents);
        let result = self.outputs();
-        let g = Game::ranked_with_arena_sv(teams, &result, &self.weights, p_draw, arena);
+        let g = Game::ranked_with_arena(teams, &result, &self.weights, p_draw, arena);
        for (t, team) in self.teams.iter_mut().enumerate() {
            for (i, item) in team.items.iter_mut().enumerate() {
@@ -300,7 +299,7 @@ impl<T: Time> TimeSlice<T> {
                let teams = event.within_priors(false, false, &self.skills, agents);
                let result = event.outputs();
-                let g = Game::ranked_with_arena_sv(
+                let g = Game::ranked_with_arena(
                    teams,
                    &result,
                    &event.weights,
@@ -480,7 +479,7 @@ impl<T: Time> TimeSlice<T> {
                self.events
                    .iter()
                    .map(|event| {
-                        Game::ranked_with_arena_sv(
+                        Game::ranked_with_arena(
                            event.within_priors(online, forward, &self.skills, agents),
                            &event.outputs(),
                            &event.weights,
@@ -506,7 +505,7 @@ impl<T: Time> TimeSlice<T> {
                        .any(|item| targets.contains(&item.agent))
                })
                .map(|(_, event)| {
-                    Game::ranked_with_arena_sv(
+                    Game::ranked_with_arena(
                        event.within_priors(online, forward, &self.skills, agents),
                        &event.outputs(),
                        &event.weights,