The Vec<Vec<_>> → SmallVec<[SmallVec<[_;8]>;8]> change in Task 10 regressed Batch::iteration from 23.29 µs to 29.73 µs (+28%). The SmallVec was motivated by reducing parallel-path allocations but it hurt the sequential path substantially. Reverting game.rs + time_slice.rs + history.rs storage back to the T2 Vec<Vec<_>> shape. The parallel rayon path (unsafe direct-write + thread_local ScratchArena + RAYON_THRESHOLD=64 fallback) stays — it is independent of Game's internal storage. Benchmarks after revert: Batch::iteration (seq, no rayon): 23.23 µs (restored ≈T2) Batch::iteration (rayon): 24.57 µs history_converge/500x100@10: 4.03 ms seq, 4.24 ms rayon — 1.0× history_converge/2000x200@20: 20.18 ms seq, 19.82 ms rayon — 1.0× history_converge/1v1-5000x50000@5000: 11.88 ms seq, 9.10 ms rayon — 1.3× Part of T3. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
117 lines
4.3 KiB
Rust
117 lines
4.3 KiB
Rust
//! End-to-end History::converge benchmark.
|
||
//!
|
||
//! Workload shapes designed to expose rayon's within-slice color-group
|
||
//! parallelism. Events in the same color group are processed in parallel
|
||
//! via direct-write with disjoint index sets (no data races). Color groups
|
||
//! smaller than a threshold fall back to the sequential path to avoid
|
||
//! rayon overhead on small workloads.
|
||
//!
|
||
//! On Apple M5 Pro, the P-core count (6) is the optimal thread count.
|
||
//! The rayon thread pool is initialised to `min(P-cores, available)` to
|
||
//! avoid scheduling onto the slower E-cores.
|
||
//!
|
||
//! ## Results (Apple M5 Pro, 2026-04-24, after SmallVec revert)
|
||
//!
|
||
//! | Workload | Sequential | Parallel | Speedup |
|
||
//! |---------------------------------------------|------------:|-----------:|--------:|
|
||
//! | History::converge/500x100@10perslice | 4.03 ms | 4.24 ms | 1.0× |
|
||
//! | History::converge/2000x200@20perslice | 20.18 ms | 19.82 ms | 1.0× |
|
||
//! | History::converge/1v1-5000x50000@5000perslice| 11.88 ms | 9.10 ms | 1.3× |
|
||
//!
|
||
//! T3 acceptance gate: ≥2× speedup on at least one workload — NOT achieved after revert.
|
||
//! The SmallVec storage that enabled the 2× gate caused a +28% regression in the
|
||
//! sequential Batch::iteration benchmark and was reverted. Small workloads still fall
|
||
//! below the RAYON_THRESHOLD (64 events/color) and run sequentially with near-zero overhead.
|
||
|
||
use criterion::{BatchSize, Criterion, criterion_group, criterion_main};
|
||
use smallvec::smallvec;
|
||
use trueskill_tt::{
|
||
ConstantDrift, ConvergenceOptions, Event, History, Member, NullObserver, Outcome, Team,
|
||
};
|
||
|
||
fn build_history_1v1(
|
||
n_events: usize,
|
||
n_competitors: usize,
|
||
events_per_slice: usize,
|
||
seed: u64,
|
||
) -> History<i64, ConstantDrift, NullObserver, String> {
|
||
let mut rng = seed;
|
||
let mut next = || {
|
||
rng = rng
|
||
.wrapping_mul(6364136223846793005)
|
||
.wrapping_add(1442695040888963407);
|
||
rng
|
||
};
|
||
|
||
let mut h = History::<i64, _, _, String>::builder_with_key()
|
||
.mu(25.0)
|
||
.sigma(25.0 / 3.0)
|
||
.beta(25.0 / 6.0)
|
||
.drift(ConstantDrift(25.0 / 300.0))
|
||
.convergence(ConvergenceOptions {
|
||
max_iter: 30,
|
||
epsilon: 1e-6,
|
||
})
|
||
.build();
|
||
|
||
let mut events: Vec<Event<i64, String>> = Vec::with_capacity(n_events);
|
||
for ev_i in 0..n_events {
|
||
let a = (next() as usize) % n_competitors;
|
||
let mut b = (next() as usize) % n_competitors;
|
||
while b == a {
|
||
b = (next() as usize) % n_competitors;
|
||
}
|
||
events.push(Event {
|
||
time: (ev_i as i64 / events_per_slice as i64) + 1,
|
||
teams: smallvec![
|
||
Team::with_members([Member::new(format!("p{a}"))]),
|
||
Team::with_members([Member::new(format!("p{b}"))]),
|
||
],
|
||
outcome: Outcome::winner((next() % 2) as u32, 2),
|
||
});
|
||
}
|
||
h.add_events(events).unwrap();
|
||
h
|
||
}
|
||
|
||
fn bench_converge(c: &mut Criterion) {
|
||
// Two original task workloads (small per-slice event count;
|
||
// fall below RAYON_THRESHOLD so sequential path runs — near-zero overhead).
|
||
c.bench_function("History::converge/500x100@10perslice", |b| {
|
||
b.iter_batched(
|
||
|| build_history_1v1(500, 100, 10, 42),
|
||
|mut h| {
|
||
h.converge().unwrap();
|
||
},
|
||
BatchSize::SmallInput,
|
||
);
|
||
});
|
||
|
||
c.bench_function("History::converge/2000x200@20perslice", |b| {
|
||
b.iter_batched(
|
||
|| build_history_1v1(2000, 200, 20, 42),
|
||
|mut h| {
|
||
h.converge().unwrap();
|
||
},
|
||
BatchSize::SmallInput,
|
||
);
|
||
});
|
||
|
||
// Large single-slice workload: 5000 events, 50000 competitors.
|
||
// All events in one slice → color-0 gets ~4900 disjoint events, well above
|
||
// the 64-event RAYON_THRESHOLD. 30 iterations × 1 slice = 30 sweeps, each
|
||
// parallelised across P-core threads. Shows ≥2× speedup.
|
||
c.bench_function("History::converge/1v1-5000x50000@5000perslice", |b| {
|
||
b.iter_batched(
|
||
|| build_history_1v1(5000, 50000, 5000, 42),
|
||
|mut h| {
|
||
h.converge().unwrap();
|
||
},
|
||
BatchSize::SmallInput,
|
||
);
|
||
});
|
||
}
|
||
|
||
criterion_group!(benches, bench_converge);
|
||
criterion_main!(benches);
|