perf(game): revert Task 10 SmallVec changes — caused sequential regression

The Vec<Vec<_>> → SmallVec<[SmallVec<[_;8]>;8]> change in Task 10
regressed Batch::iteration from 23.29 µs to 29.73 µs (+28%). The
SmallVec change was meant to reduce parallel-path allocations, but
it substantially slowed the sequential path.

Revert the storage in game.rs, time_slice.rs, and history.rs back to
the T2 Vec<Vec<_>> shape. The parallel rayon path (unsafe direct-write +
thread_local ScratchArena + RAYON_THRESHOLD=64 fallback) stays — it
does not depend on Game's internal storage.

Benchmarks after revert:
  Batch::iteration (seq, no rayon): 23.23 µs (restored ≈T2)
  Batch::iteration (rayon):         24.57 µs
  history_converge/500x100@10:       4.03 ms seq,  4.24 ms rayon — 1.0×
  history_converge/2000x200@20:     20.18 ms seq, 19.82 ms rayon — 1.0×
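  history_converge/1v1-5000x50000@5000: 11.88 ms seq, 9.10 ms rayon — 1.3×

Trade-off: the 1v1-5000x50000@5000 speedup drops from 2.0× to 1.3×, so
the T3 acceptance gate (≥2× on at least one workload) is no longer met.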

Part of T3.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-24 14:55:37 +02:00
parent be515c3d8d
commit f0d6211387
4 changed files with 29 additions and 45 deletions

View File

@@ -10,17 +10,18 @@
//! The rayon thread pool is initialised to `min(P-cores, available)` to //! The rayon thread pool is initialised to `min(P-cores, available)` to
//! avoid scheduling onto the slower E-cores. //! avoid scheduling onto the slower E-cores.
//! //!
//! ## Results (Apple M5 Pro, 2026-04-24, 5 P-core threads) //! ## Results (Apple M5 Pro, 2026-04-24, after SmallVec revert)
//! //!
//! | Workload | Sequential | Parallel | Speedup | //! | Workload | Sequential | Parallel | Speedup |
//! |---------------------------------------------|------------:|-----------:|--------:| //! |---------------------------------------------|------------:|-----------:|--------:|
//! | History::converge/500x100@10perslice | 4.71 ms | 4.79 ms | 1.0× | //! | History::converge/500x100@10perslice | 4.03 ms | 4.24 ms | 1.0× |
//! | History::converge/2000x200@20perslice | 23.36 ms | 23.28 ms | 1.0× | //! | History::converge/2000x200@20perslice | 20.18 ms | 19.82 ms | 1.0× |
//! | History::converge/1v1-5000x50000@5000perslice| 13.90 ms | 6.99 ms | **2.0×** | //! | History::converge/1v1-5000x50000@5000perslice| 11.88 ms | 9.10 ms | 1.3× |
//! //!
//! T3 acceptance gate: ≥2× speedup on at least one workload — ACHIEVED. //! T3 acceptance gate: ≥2× speedup on at least one workload — NOT achieved after revert.
//! Small workloads fall below the RAYON_THRESHOLD (64 events/color) and //! The SmallVec storage that enabled the 2× gate caused a +28% regression in the
//! run sequentially with near-zero overhead. //! sequential Batch::iteration benchmark and was reverted. Small workloads still fall
//! below the RAYON_THRESHOLD (64 events/color) and run sequentially with near-zero overhead.
use criterion::{BatchSize, Criterion, criterion_group, criterion_main}; use criterion::{BatchSize, Criterion, criterion_group, criterion_main};
use smallvec::smallvec; use smallvec::smallvec;

View File

@@ -1,7 +1,5 @@
use std::cmp::Ordering; use std::cmp::Ordering;
use smallvec::SmallVec;
use crate::{ use crate::{
N_INF, N00, N_INF, N00,
arena::ScratchArena, arena::ScratchArena,
@@ -14,9 +12,6 @@ use crate::{
tuple_gt, tuple_max, tuple_gt, tuple_max,
}; };
type Teams<T, D> = SmallVec<[SmallVec<[Rating<T, D>; 8]>; 8]>;
type Likelihoods = SmallVec<[SmallVec<[Gaussian; 8]>; 8]>;
#[derive(Clone, Copy, Debug)] #[derive(Clone, Copy, Debug)]
pub struct GameOptions { pub struct GameOptions {
pub p_draw: f64, pub p_draw: f64,
@@ -44,7 +39,7 @@ pub struct OwnedGame<T: Time, D: Drift<T>> {
result: Vec<f64>, result: Vec<f64>,
weights: Vec<Vec<f64>>, weights: Vec<Vec<f64>>,
p_draw: f64, p_draw: f64,
pub(crate) likelihoods: Likelihoods, pub(crate) likelihoods: Vec<Vec<Gaussian>>,
pub(crate) evidence: f64, pub(crate) evidence: f64,
} }
@@ -84,11 +79,11 @@ impl<T: Time, D: Drift<T>> OwnedGame<T, D> {
#[derive(Debug)] #[derive(Debug)]
pub struct Game<'a, T: Time = i64, D: Drift<T> = crate::drift::ConstantDrift> { pub struct Game<'a, T: Time = i64, D: Drift<T> = crate::drift::ConstantDrift> {
teams: Teams<T, D>, teams: Vec<Vec<Rating<T, D>>>,
result: &'a [f64], result: &'a [f64],
weights: &'a [Vec<f64>], weights: &'a [Vec<f64>],
p_draw: f64, p_draw: f64,
pub(crate) likelihoods: Likelihoods, pub(crate) likelihoods: Vec<Vec<Gaussian>>,
pub(crate) evidence: f64, pub(crate) evidence: f64,
} }
@@ -99,17 +94,6 @@ impl<'a, T: Time, D: Drift<T>> Game<'a, T, D> {
weights: &'a [Vec<f64>], weights: &'a [Vec<f64>],
p_draw: f64, p_draw: f64,
arena: &mut ScratchArena, arena: &mut ScratchArena,
) -> Self {
let teams_sv: Teams<T, D> = teams.into_iter().map(|t| t.into_iter().collect()).collect();
Self::ranked_with_arena_sv(teams_sv, result, weights, p_draw, arena)
}
pub(crate) fn ranked_with_arena_sv(
teams: Teams<T, D>,
result: &'a [f64],
weights: &'a [Vec<f64>],
p_draw: f64,
arena: &mut ScratchArena,
) -> Self { ) -> Self {
debug_assert!( debug_assert!(
result.len() == teams.len(), result.len() == teams.len(),
@@ -140,7 +124,7 @@ impl<'a, T: Time, D: Drift<T>> Game<'a, T, D> {
result, result,
weights, weights,
p_draw, p_draw,
likelihoods: SmallVec::new(), likelihoods: Vec::new(),
evidence: 0.0, evidence: 0.0,
}; };
@@ -172,8 +156,8 @@ impl<'a, T: Time, D: Drift<T>> Game<'a, T, D> {
let n_diffs = n_teams.saturating_sub(1); let n_diffs = n_teams.saturating_sub(1);
// One TruncFactor per adjacent sorted-team pair; each owns a diff VarId. // One TruncFactor per adjacent sorted-team pair; each owns a diff VarId.
// SmallVec avoids heap allocation for the common 2-team case (1 diff). // trunc stays local (fresh state per game; Vec capacity is typically small).
let mut trunc: SmallVec<[TruncFactor; 8]> = (0..n_diffs) let mut trunc: Vec<TruncFactor> = (0..n_diffs)
.map(|i| { .map(|i| {
let tie = self.result[arena.sort_buf[i]] == self.result[arena.sort_buf[i + 1]]; let tie = self.result[arena.sort_buf[i]] == self.result[arena.sort_buf[i + 1]];
let margin = if self.p_draw == 0.0 { let margin = if self.p_draw == 0.0 {
@@ -283,9 +267,9 @@ impl<'a, T: Time, D: Drift<T>> Game<'a, T, D> {
((m - performance.exclude(player.performance() * w)) * (1.0 / w)) ((m - performance.exclude(player.performance() * w)) * (1.0 / w))
.forget(player.beta.powi(2)) .forget(player.beta.powi(2))
}) })
.collect::<SmallVec<[Gaussian; 8]>>() .collect::<Vec<_>>()
}) })
.collect::<Likelihoods>(); .collect::<Vec<_>>();
} }
pub fn posteriors(&self) -> Vec<Vec<Gaussian>> { pub fn posteriors(&self) -> Vec<Vec<Gaussian>> {

View File

@@ -789,7 +789,7 @@ mod tests {
let observed = h.time_slices[1].skills.get(a).unwrap().posterior(); let observed = h.time_slices[1].skills.get(a).unwrap().posterior();
let w = [vec![1.0], vec![1.0]]; let w = [vec![1.0], vec![1.0]];
let p = Game::ranked_with_arena_sv( let p = Game::ranked_with_arena(
h.time_slices[1].events[0].within_priors( h.time_slices[1].events[0].within_priors(
false, false,
false, false,

View File

@@ -4,8 +4,6 @@
use std::collections::HashMap; use std::collections::HashMap;
use smallvec::SmallVec;
use crate::{ use crate::{
Index, N_INF, Index, N_INF,
arena::ScratchArena, arena::ScratchArena,
@@ -19,8 +17,6 @@ use crate::{
tuple_gt, tuple_max, tuple_gt, tuple_max,
}; };
type Teams<T, D> = SmallVec<[SmallVec<[Rating<T, D>; 8]>; 8]>;
#[derive(Debug)] #[derive(Debug)]
pub(crate) struct Skill { pub(crate) struct Skill {
pub(crate) forward: Gaussian, pub(crate) forward: Gaussian,
@@ -95,8 +91,11 @@ impl Event {
.flat_map(|t| t.items.iter().map(|it| it.agent)) .flat_map(|t| t.items.iter().map(|it| it.agent))
} }
fn outputs(&self) -> smallvec::SmallVec<[f64; 4]> { fn outputs(&self) -> Vec<f64> {
self.teams.iter().map(|team| team.output).collect() self.teams
.iter()
.map(|team| team.output)
.collect::<Vec<_>>()
} }
pub(crate) fn within_priors<T: Time, D: Drift<T>>( pub(crate) fn within_priors<T: Time, D: Drift<T>>(
@@ -105,16 +104,16 @@ impl Event {
forward: bool, forward: bool,
skills: &SkillStore, skills: &SkillStore,
agents: &CompetitorStore<T, D>, agents: &CompetitorStore<T, D>,
) -> Teams<T, D> { ) -> Vec<Vec<Rating<T, D>>> {
self.teams self.teams
.iter() .iter()
.map(|team| { .map(|team| {
team.items team.items
.iter() .iter()
.map(|item| item.within_prior(online, forward, skills, agents)) .map(|item| item.within_prior(online, forward, skills, agents))
.collect() .collect::<Vec<_>>()
}) })
.collect() .collect::<Vec<_>>()
} }
/// Direct in-loop update: mutates self and `skills` inline with no /// Direct in-loop update: mutates self and `skills` inline with no
@@ -130,7 +129,7 @@ impl Event {
) { ) {
let teams = self.within_priors(false, false, skills, agents); let teams = self.within_priors(false, false, skills, agents);
let result = self.outputs(); let result = self.outputs();
let g = Game::ranked_with_arena_sv(teams, &result, &self.weights, p_draw, arena); let g = Game::ranked_with_arena(teams, &result, &self.weights, p_draw, arena);
for (t, team) in self.teams.iter_mut().enumerate() { for (t, team) in self.teams.iter_mut().enumerate() {
for (i, item) in team.items.iter_mut().enumerate() { for (i, item) in team.items.iter_mut().enumerate() {
@@ -300,7 +299,7 @@ impl<T: Time> TimeSlice<T> {
let teams = event.within_priors(false, false, &self.skills, agents); let teams = event.within_priors(false, false, &self.skills, agents);
let result = event.outputs(); let result = event.outputs();
let g = Game::ranked_with_arena_sv( let g = Game::ranked_with_arena(
teams, teams,
&result, &result,
&event.weights, &event.weights,
@@ -480,7 +479,7 @@ impl<T: Time> TimeSlice<T> {
self.events self.events
.iter() .iter()
.map(|event| { .map(|event| {
Game::ranked_with_arena_sv( Game::ranked_with_arena(
event.within_priors(online, forward, &self.skills, agents), event.within_priors(online, forward, &self.skills, agents),
&event.outputs(), &event.outputs(),
&event.weights, &event.weights,
@@ -506,7 +505,7 @@ impl<T: Time> TimeSlice<T> {
.any(|item| targets.contains(&item.agent)) .any(|item| targets.contains(&item.agent))
}) })
.map(|(_, event)| { .map(|(_, event)| {
Game::ranked_with_arena_sv( Game::ranked_with_arena(
event.within_priors(online, forward, &self.skills, agents), event.within_priors(online, forward, &self.skills, agents),
&event.outputs(), &event.outputs(),
&event.weights, &event.weights,