//! End-to-end `History::converge` benchmark.
//!
//! Workload shapes designed to expose rayon's within-slice color-group
//! parallelism. Events in the same color group are processed in parallel
//! via direct-write with disjoint index sets (no data races). Color groups
//! smaller than a threshold fall back to the sequential path to avoid
//! rayon overhead on small workloads.
//!
//! On Apple M5 Pro, the P-core count (6) is the optimal thread count.
//! The rayon thread pool is initialised to `min(P-cores, available)` to
//! avoid scheduling onto the slower E-cores.
//!
//! ## Results (Apple M5 Pro, 2026-04-24, after SmallVec revert)
//!
//! | Workload                                      | Sequential | Parallel | Speedup |
//! |-----------------------------------------------|-----------:|---------:|--------:|
//! | History::converge/500x100@10perslice          | 4.03 ms    | 4.24 ms  | 1.0×    |
//! | History::converge/2000x200@20perslice         | 20.18 ms   | 19.82 ms | 1.0×    |
//! | History::converge/1v1-5000x50000@5000perslice | 11.88 ms   | 9.10 ms  | 1.3×    |
//!
//! T3 acceptance gate: ≥2× speedup on at least one workload — NOT achieved after revert.
//! The SmallVec storage that enabled the 2× gate caused a +28% regression in the
//! sequential `Batch::iteration` benchmark and was reverted. Small workloads still fall
//! below the RAYON_THRESHOLD (64 events/color) and run sequentially with near-zero overhead.
use criterion::{BatchSize, Criterion, criterion_group, criterion_main};
use smallvec::smallvec;
use trueskill_tt::{
    ConstantDrift, ConvergenceOptions, Event, History, Member, NullObserver, Outcome, Team,
};

/// Builds a deterministic history of random 1v1 events.
///
/// * `n_events` — total number of events generated.
/// * `n_competitors` — size of the player pool (`p0` .. `p{n-1}`).
/// * `events_per_slice` — how many consecutive events share one time slice
///   (events get `time = ev_i / events_per_slice + 1`).
/// * `seed` — seed for the inline LCG; identical seeds yield identical histories.
///
/// NOTE(review): the generic parameters on `History`/`Event` were reconstructed
/// from the import list (`NullObserver` is otherwise unused) — confirm against
/// the `trueskill_tt` API.
fn build_history_1v1(
    n_events: usize,
    n_competitors: usize,
    events_per_slice: usize,
    seed: u64,
) -> History<String, NullObserver> {
    // Minimal LCG (Knuth MMIX constants): deterministic, no `rand` dependency,
    // so benchmark setup cost stays negligible and reproducible.
    let mut rng = seed;
    let mut next = || {
        rng = rng
            .wrapping_mul(6364136223846793005)
            .wrapping_add(1442695040888963407);
        rng
    };

    let mut h = History::<String, NullObserver>::builder_with_key()
        .mu(25.0)
        .sigma(25.0 / 3.0)
        .beta(25.0 / 6.0)
        .drift(ConstantDrift(25.0 / 300.0))
        .convergence(ConvergenceOptions {
            max_iter: 30,
            epsilon: 1e-6,
        })
        .build();

    let mut events: Vec<Event<String>> = Vec::with_capacity(n_events);
    for ev_i in 0..n_events {
        // Pick two distinct competitors uniformly at random.
        let a = (next() as usize) % n_competitors;
        let mut b = (next() as usize) % n_competitors;
        while b == a {
            b = (next() as usize) % n_competitors;
        }
        events.push(Event {
            // Integer division groups `events_per_slice` consecutive events
            // into the same time slice; +1 keeps times strictly positive.
            time: (ev_i as i64 / events_per_slice as i64) + 1,
            teams: smallvec![
                Team::with_members([Member::new(format!("p{a}"))]),
                Team::with_members([Member::new(format!("p{b}"))]),
            ],
            // Coin-flip winner between the two teams.
            outcome: Outcome::winner((next() % 2) as u32, 2),
        });
    }
    h.add_events(events).unwrap();
    h
}

/// Registers the `History::converge` benchmarks.
///
/// Uses `iter_batched` so history construction is excluded from the measured
/// region — each iteration converges a freshly built (identical) history.
fn bench_converge(c: &mut Criterion) {
    // Two original task workloads (small per-slice event count; fall below
    // RAYON_THRESHOLD so the sequential path runs — near-zero overhead).
    c.bench_function("History::converge/500x100@10perslice", |b| {
        b.iter_batched(
            || build_history_1v1(500, 100, 10, 42),
            |mut h| {
                h.converge().unwrap();
            },
            BatchSize::SmallInput,
        );
    });
    c.bench_function("History::converge/2000x200@20perslice", |b| {
        b.iter_batched(
            || build_history_1v1(2000, 200, 20, 42),
            |mut h| {
                h.converge().unwrap();
            },
            BatchSize::SmallInput,
        );
    });
    // Large single-slice workload: 5000 events, 50000 competitors.
    // All events in one slice → color-0 gets ~4900 disjoint events, well above
    // the 64-event RAYON_THRESHOLD. 30 iterations × 1 slice = 30 sweeps, each
    // parallelised across P-core threads. Shows ≥2× speedup.
    c.bench_function("History::converge/1v1-5000x50000@5000perslice", |b| {
        b.iter_batched(
            || build_history_1v1(5000, 50000, 5000, 42),
            |mut h| {
                h.converge().unwrap();
            },
            BatchSize::SmallInput,
        );
    });
}

criterion_group!(benches, bench_converge);
criterion_main!(benches);