// trueskill-tt/benches/history_converge.rs

//! End-to-end History::converge benchmark.
//!
//! Workload shapes designed to expose rayon's within-slice color-group
//! parallelism. Events in the same color group are processed in parallel
//! via direct-write with disjoint index sets (no data races). Color groups
//! smaller than a threshold fall back to the sequential path to avoid
//! rayon overhead on small workloads.
//!
//! On Apple M5 Pro, the P-core count (6) is the optimal thread count.
//! The rayon thread pool is initialised to `min(P-cores, available)` to
//! avoid scheduling onto the slower E-cores.
//!
//! ## Results (Apple M5 Pro, 2026-04-24, after SmallVec revert)
//!
//! | Workload                                      | Sequential  | Parallel   | Speedup |
//! |-----------------------------------------------|------------:|-----------:|--------:|
//! | History::converge/500x100@10perslice          |     4.03 ms |    4.24 ms |   1.0×  |
//! | History::converge/2000x200@20perslice         |    20.18 ms |   19.82 ms |   1.0×  |
//! | History::converge/1v1-5000x50000@5000perslice |    11.88 ms |    9.10 ms |   1.3×  |
//!
//! T3 acceptance gate: ≥2× speedup on at least one workload — NOT achieved after revert.
//! The SmallVec storage that enabled the 2× gate caused a +28% regression in the
//! sequential Batch::iteration benchmark and was reverted. Small workloads still fall
//! below the RAYON_THRESHOLD (64 events/color) and run sequentially with near-zero overhead.

use criterion::{BatchSize, Criterion, criterion_group, criterion_main};
use smallvec::smallvec;
use trueskill_tt::{
    ConstantDrift, ConvergenceOptions, Event, History, Member, NullObserver, Outcome, Team,
};

/// Builds a deterministic 1v1 [`History`] used as benchmark input.
///
/// Generates `n_events` events, each with two single-member teams whose
/// competitors are keyed `"p{index}"` with `index < n_competitors`. All draws
/// come from a 64-bit LCG seeded with `seed`, so identical arguments always
/// produce the identical workload. Event `i` gets time
/// `i / events_per_slice + 1`, i.e. every run of `events_per_slice`
/// consecutive events shares one time value.
///
/// # Panics
///
/// When `n_events > 0`, panics if `n_competitors < 2` (two distinct
/// competitors cannot be drawn) or if `events_per_slice == 0`. Also panics if
/// `History::add_events` returns an error for the generated events.
fn build_history_1v1(
    n_events: usize,
    n_competitors: usize,
    events_per_slice: usize,
    seed: u64,
) -> History<i64, ConstantDrift, NullObserver, String> {
    // Fail fast with a readable message. Without these checks a single
    // competitor makes the `b != a` retry loop below spin forever, and a zero
    // here surfaces as a bare divide-by-zero panic. The checks are skipped for
    // `n_events == 0` because no draw or division happens in that case.
    assert!(
        n_events == 0 || n_competitors >= 2,
        "build_history_1v1: need at least 2 competitors for a 1v1 event, got {n_competitors}"
    );
    assert!(
        n_events == 0 || events_per_slice > 0,
        "build_history_1v1: events_per_slice must be non-zero"
    );

    // Minimal 64-bit LCG (wrapping arithmetic, modulus 2^64); keeps the
    // benchmark free of an RNG dependency and fully reproducible.
    //
    // NOTE(review): the draws below use the low-order bits of the state.
    // Multiplier and increment are both odd, so the lowest bit flips on every
    // call: for an even `n_competitors`, `a` and `b` always have opposite
    // parity (the retry loop never runs) and the winner index simply
    // alternates from one event to the next. Left unchanged on purpose so the
    // workload — and any timings recorded against it — stay comparable; use
    // the high bits (e.g. `rng >> 33`) if better-mixed draws are ever needed.
    let mut rng = seed;
    let mut next = || {
        rng = rng
            .wrapping_mul(6364136223846793005)
            .wrapping_add(1442695040888963407);
        rng
    };

    let mut h = History::<i64, _, _, String>::builder_with_key()
        .mu(25.0)
        .sigma(25.0 / 3.0)
        .beta(25.0 / 6.0)
        .drift(ConstantDrift(25.0 / 300.0))
        .convergence(ConvergenceOptions {
            max_iter: 30,
            epsilon: 1e-6,
            alpha: 1.0,
        })
        .build();

    let events: Vec<Event<i64, String>> = (0..n_events)
        .map(|ev_i| {
            // Draw order matters for reproducibility: first competitor, second
            // competitor (redrawn until distinct), then the winner index.
            let a = (next() as usize) % n_competitors;
            let mut b = (next() as usize) % n_competitors;
            while b == a {
                b = (next() as usize) % n_competitors;
            }
            let winner = (next() % 2) as u32;

            Event {
                // Times start at 1; integer division groups consecutive events.
                time: (ev_i as i64 / events_per_slice as i64) + 1,
                teams: smallvec![
                    Team::with_members([Member::new(format!("p{a}"))]),
                    Team::with_members([Member::new(format!("p{b}"))]),
                ],
                outcome: Outcome::winner(winner, 2),
            }
        })
        .collect();

    h.add_events(events).unwrap();
    h
}

/// Registers the end-to-end `History::converge` benchmarks with Criterion.
///
/// Each benchmark builds a fresh history in the (untimed) setup closure and
/// then times a single `converge()` call on it.
fn bench_converge(c: &mut Criterion) {
    // (benchmark id, n_events, n_competitors, events_per_slice)
    //
    // * The first two are the original task workloads. Their per-slice event
    //   counts (10 and 20) are small; per the module docs they stay below
    //   RAYON_THRESHOLD and therefore exercise the sequential path.
    // * The third puts all 5000 events into a single slice (events_per_slice
    //   == n_events) drawn from 50000 competitors, which is meant to produce
    //   color groups far above the 64-event RAYON_THRESHOLD so the parallel
    //   path is used. The measured gain recorded in the module docs is 1.3×;
    //   the ≥2× target was not met after the SmallVec revert.
    const WORKLOADS: [(&str, usize, usize, usize); 3] = [
        ("History::converge/500x100@10perslice", 500, 100, 10),
        ("History::converge/2000x200@20perslice", 2000, 200, 20),
        (
            "History::converge/1v1-5000x50000@5000perslice",
            5000,
            50000,
            5000,
        ),
    ];
    // Fixed seed so every run benchmarks the same generated events.
    const SEED: u64 = 42;

    for &(id, n_events, n_competitors, events_per_slice) in &WORKLOADS {
        c.bench_function(id, |b| {
            b.iter_batched(
                || build_history_1v1(n_events, n_competitors, events_per_slice, SEED),
                // NOTE(review): `h` is consumed and dropped inside this
                // routine, so tearing down the History is part of the timed
                // region — confirm that is intended.
                |mut h| {
                    h.converge().unwrap();
                },
                BatchSize::SmallInput,
            );
        });
    }
}

// Criterion entry points: `criterion_group!` bundles `bench_converge` into a
// group named `benches`, and `criterion_main!` generates the `main` function
// that runs that group.
criterion_group!(benches, bench_converge);
criterion_main!(benches);