From f0d62113870d26779e4c9c99db8c013e8a9fffcf Mon Sep 17 00:00:00 2001 From: Anders Olsson Date: Fri, 24 Apr 2026 14:55:37 +0200 Subject: [PATCH] =?UTF-8?q?perf(game):=20revert=20Task=2010=20SmallVec=20c?= =?UTF-8?q?hanges=20=E2=80=94=20caused=20sequential=20regression?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Vec<Vec<_>> → SmallVec<[SmallVec<[_;8]>;8]> change in Task 10 regressed Batch::iteration from 23.29 µs to 29.73 µs (+28%). The SmallVec was motivated by reducing parallel-path allocations but it hurt the sequential path substantially. Reverting game.rs + time_slice.rs + history.rs storage back to the T2 Vec<Vec<_>> shape. The parallel rayon path (unsafe direct-write + thread_local ScratchArena + RAYON_THRESHOLD=64 fallback) stays — it is independent of Game's internal storage. Benchmarks after revert: Batch::iteration (seq, no rayon): 23.23 µs (restored ≈T2) Batch::iteration (rayon): 24.57 µs history_converge/500x100@10: 4.03 ms seq, 4.24 ms rayon — 1.0× history_converge/2000x200@20: 20.18 ms seq, 19.82 ms rayon — 1.0× history_converge/1v1-5000x50000@5000: 11.88 ms seq, 9.10 ms rayon — 1.3× Part of T3. Co-Authored-By: Claude Sonnet 4.6 --- benches/history_converge.rs | 15 ++++++++------- src/game.rs | 32 ++++++++------------------------ src/history.rs | 2 +- src/time_slice.rs | 25 ++++++++++++------------- 4 files changed, 29 insertions(+), 45 deletions(-) diff --git a/benches/history_converge.rs b/benches/history_converge.rs index b3a4ea0..e5163a8 100644 --- a/benches/history_converge.rs +++ b/benches/history_converge.rs @@ -10,17 +10,18 @@ //! The rayon thread pool is initialised to `min(P-cores, available)` to //! avoid scheduling onto the slower E-cores. //! -//! ## Results (Apple M5 Pro, 2026-04-24, 5 P-core threads) +//! ## Results (Apple M5 Pro, 2026-04-24, after SmallVec revert) //! //! | Workload | Sequential | Parallel | Speedup | //! 
|---------------------------------------------|------------:|-----------:|--------:| -//! | History::converge/500x100@10perslice | 4.71 ms | 4.79 ms | 1.0× | -//! | History::converge/2000x200@20perslice | 23.36 ms | 23.28 ms | 1.0× | -//! | History::converge/1v1-5000x50000@5000perslice| 13.90 ms | 6.99 ms | **2.0×** | +//! | History::converge/500x100@10perslice | 4.03 ms | 4.24 ms | 1.0× | +//! | History::converge/2000x200@20perslice | 20.18 ms | 19.82 ms | 1.0× | +//! | History::converge/1v1-5000x50000@5000perslice| 11.88 ms | 9.10 ms | 1.3× | //! -//! T3 acceptance gate: ≥2× speedup on at least one workload — ACHIEVED. -//! Small workloads fall below the RAYON_THRESHOLD (64 events/color) and -//! run sequentially with near-zero overhead. +//! T3 acceptance gate: ≥2× speedup on at least one workload — NOT achieved after revert. +//! The SmallVec storage that enabled the 2× gate caused a +28% regression in the +//! sequential Batch::iteration benchmark and was reverted. Small workloads still fall +//! below the RAYON_THRESHOLD (64 events/color) and run sequentially with near-zero overhead. 
use criterion::{BatchSize, Criterion, criterion_group, criterion_main}; use smallvec::smallvec; diff --git a/src/game.rs b/src/game.rs index 617e5c3..16be834 100644 --- a/src/game.rs +++ b/src/game.rs @@ -1,7 +1,5 @@ use std::cmp::Ordering; -use smallvec::SmallVec; - use crate::{ N_INF, N00, arena::ScratchArena, @@ -14,9 +12,6 @@ use crate::{ tuple_gt, tuple_max, }; -type Teams = SmallVec<[SmallVec<[Rating; 8]>; 8]>; -type Likelihoods = SmallVec<[SmallVec<[Gaussian; 8]>; 8]>; - #[derive(Clone, Copy, Debug)] pub struct GameOptions { pub p_draw: f64, @@ -44,7 +39,7 @@ pub struct OwnedGame> { result: Vec, weights: Vec>, p_draw: f64, - pub(crate) likelihoods: Likelihoods, + pub(crate) likelihoods: Vec>, pub(crate) evidence: f64, } @@ -84,11 +79,11 @@ impl> OwnedGame { #[derive(Debug)] pub struct Game<'a, T: Time = i64, D: Drift = crate::drift::ConstantDrift> { - teams: Teams, + teams: Vec>>, result: &'a [f64], weights: &'a [Vec], p_draw: f64, - pub(crate) likelihoods: Likelihoods, + pub(crate) likelihoods: Vec>, pub(crate) evidence: f64, } @@ -99,17 +94,6 @@ impl<'a, T: Time, D: Drift> Game<'a, T, D> { weights: &'a [Vec], p_draw: f64, arena: &mut ScratchArena, - ) -> Self { - let teams_sv: Teams = teams.into_iter().map(|t| t.into_iter().collect()).collect(); - Self::ranked_with_arena_sv(teams_sv, result, weights, p_draw, arena) - } - - pub(crate) fn ranked_with_arena_sv( - teams: Teams, - result: &'a [f64], - weights: &'a [Vec], - p_draw: f64, - arena: &mut ScratchArena, ) -> Self { debug_assert!( result.len() == teams.len(), @@ -140,7 +124,7 @@ impl<'a, T: Time, D: Drift> Game<'a, T, D> { result, weights, p_draw, - likelihoods: SmallVec::new(), + likelihoods: Vec::new(), evidence: 0.0, }; @@ -172,8 +156,8 @@ impl<'a, T: Time, D: Drift> Game<'a, T, D> { let n_diffs = n_teams.saturating_sub(1); // One TruncFactor per adjacent sorted-team pair; each owns a diff VarId. - // SmallVec avoids heap allocation for the common 2-team case (1 diff). 
- let mut trunc: SmallVec<[TruncFactor; 8]> = (0..n_diffs) + // trunc stays local (fresh state per game; Vec capacity is typically small). + let mut trunc: Vec = (0..n_diffs) .map(|i| { let tie = self.result[arena.sort_buf[i]] == self.result[arena.sort_buf[i + 1]]; let margin = if self.p_draw == 0.0 { @@ -283,9 +267,9 @@ impl<'a, T: Time, D: Drift> Game<'a, T, D> { ((m - performance.exclude(player.performance() * w)) * (1.0 / w)) .forget(player.beta.powi(2)) }) - .collect::>() + .collect::>() }) - .collect::(); + .collect::>(); } pub fn posteriors(&self) -> Vec> { diff --git a/src/history.rs b/src/history.rs index ea42c81..6d4439c 100644 --- a/src/history.rs +++ b/src/history.rs @@ -789,7 +789,7 @@ mod tests { let observed = h.time_slices[1].skills.get(a).unwrap().posterior(); let w = [vec![1.0], vec![1.0]]; - let p = Game::ranked_with_arena_sv( + let p = Game::ranked_with_arena( h.time_slices[1].events[0].within_priors( false, false, diff --git a/src/time_slice.rs b/src/time_slice.rs index b3fee41..cc19b30 100644 --- a/src/time_slice.rs +++ b/src/time_slice.rs @@ -4,8 +4,6 @@ use std::collections::HashMap; -use smallvec::SmallVec; - use crate::{ Index, N_INF, arena::ScratchArena, @@ -19,8 +17,6 @@ use crate::{ tuple_gt, tuple_max, }; -type Teams = SmallVec<[SmallVec<[Rating; 8]>; 8]>; - #[derive(Debug)] pub(crate) struct Skill { pub(crate) forward: Gaussian, @@ -95,8 +91,11 @@ impl Event { .flat_map(|t| t.items.iter().map(|it| it.agent)) } - fn outputs(&self) -> smallvec::SmallVec<[f64; 4]> { - self.teams.iter().map(|team| team.output).collect() + fn outputs(&self) -> Vec { + self.teams + .iter() + .map(|team| team.output) + .collect::>() } pub(crate) fn within_priors>( @@ -105,16 +104,16 @@ impl Event { forward: bool, skills: &SkillStore, agents: &CompetitorStore, - ) -> Teams { + ) -> Vec>> { self.teams .iter() .map(|team| { team.items .iter() .map(|item| item.within_prior(online, forward, skills, agents)) - .collect() + .collect::>() }) - .collect() + 
.collect::>() } /// Direct in-loop update: mutates self and `skills` inline with no @@ -130,7 +129,7 @@ impl Event { ) { let teams = self.within_priors(false, false, skills, agents); let result = self.outputs(); - let g = Game::ranked_with_arena_sv(teams, &result, &self.weights, p_draw, arena); + let g = Game::ranked_with_arena(teams, &result, &self.weights, p_draw, arena); for (t, team) in self.teams.iter_mut().enumerate() { for (i, item) in team.items.iter_mut().enumerate() { @@ -300,7 +299,7 @@ impl TimeSlice { let teams = event.within_priors(false, false, &self.skills, agents); let result = event.outputs(); - let g = Game::ranked_with_arena_sv( + let g = Game::ranked_with_arena( teams, &result, &event.weights, @@ -480,7 +479,7 @@ impl TimeSlice { self.events .iter() .map(|event| { - Game::ranked_with_arena_sv( + Game::ranked_with_arena( event.within_priors(online, forward, &self.skills, agents), &event.outputs(), &event.weights, @@ -506,7 +505,7 @@ impl TimeSlice { .any(|item| targets.contains(&item.agent)) }) .map(|(_, event)| { - Game::ranked_with_arena_sv( + Game::ranked_with_arena( event.within_priors(online, forward, &self.skills, agents), &event.outputs(), &event.weights,