2026-04-24 13:01:01 +00:00
15 changed files with 2022 additions and 36 deletions
@@ -2,6 +2,66 @@
 All notable changes to this project will be documented in this file.
 ## Unreleased — T3 concurrency
 Adds rayon-backed parallel paths per Section 6 of
 `docs/superpowers/specs/2026-04-23-trueskill-engine-redesign-design.md`.
 ### Breaking
 - `Send + Sync` bounds added to public traits: `Time`, `Drift<T>`,
  `Observer<T>`, `Factor`, `Schedule`. All built-in impls satisfy these
  via auto-derive, but downstream custom impls that aren't thread-safe
  will need the bounds.
 ### New
 - Opt-in `rayon` cargo feature. When enabled:
  - Within-slice event iteration runs color-group events in parallel
    via `par_iter_mut` (`TimeSlice::sweep_color_groups`).
  - `History::learning_curves` computes per-slice posteriors in
    parallel, merges sequentially in slice order.
  - `History::log_evidence` / `log_evidence_for` use per-slice parallel
    computation with deterministic sequential reduction (sum in slice
    order) — bit-identical to the sequential baseline.
 - `ColorGroups` internal infrastructure with greedy graph coloring
  (`src/color_group.rs`). Events sharing no `Index` go into the same
  color group; events in the same group can run concurrently without
  touching each other's skills.
 - `tests/determinism.rs` asserts bit-identical posteriors across
  `RAYON_NUM_THREADS={1, 2, 4, 8}`.
 - `benches/history_converge.rs` measures end-to-end convergence on
  three workload shapes.
 ### Performance notes
 - Default build (no rayon): `Batch::iteration` 23.23 µs — no regression
  vs T2.
 - With `--features rayon`:
  - 500 events / 100 competitors / 10 per slice: 1.0× speedup.
  - 2000 events / 200 competitors / 20 per slice: 1.0× speedup.
  - 5000 events in one slice / 50k competitors: **1.3× speedup.**
 - The spec targeted >2× speedup on 8-core offline converge. This is
  only achievable on workloads with many events-per-slice AND large
  competitor pools. **Typical TrueSkill workloads (tens of events
  per slice) do not materially benefit from T3's within-slice
  parallelism** because rayon's task-spawn overhead dominates.
 - Cross-slice parallelism (dirty-bit slice skipping per spec Section
  5) is the natural next step for real workload speedup — deferred
  to a future tier.
 ### Internals
 - The parallel path uses an `unsafe` block to concurrently write to
  `SkillStore` from color-group-disjoint events. Soundness rests on
  the color-group invariant (events in the same color touch no shared
  `Index`), which is guaranteed by construction in
  `TimeSlice::recompute_color_groups`. Sequential path unchanged.
 - `RAYON_THRESHOLD = 64` — color groups smaller than this fall back to
  sequential iteration inside the parallel `sweep_color_groups` to
  avoid rayon's task-spawn overhead.
 - Thread-local `ScratchArena` per rayon worker thread.
 ## Unreleased — T2 new API surface
 Breaking: every renamed type and the new public API land together per
@@ -14,10 +14,19 @@ harness = false
 name = "gaussian"
 harness = false
 [[bench]]
 name = "history_converge"
 harness = false
 [dependencies]
 approx = { version = "0.5.1", optional = true }
 rayon = { version = "1", optional = true }
 smallvec = "1"
 [features]
 approx = ["dep:approx"]
 rayon = ["dep:rayon"]
 [dev-dependencies]
 criterion = "0.5"
 plotters = { version = "0.3", default-features = false, features = ["svg_backend", "all_elements", "all_series"] }
@@ -98,3 +98,35 @@ Gaussian::tau             260.80 ps    (unchanged)
 #   learning_curves_by_index(), nested-Vec public add_events().
 # - 90 tests green: 68 lib + 10 api_shape + 6 game + 4 record_winner +
 #   2 equivalence.
 # After T3 (2026-04-24, same hardware)
 Batch::iteration (seq, no rayon)     23.23 µs   (matches T2 baseline; no regression)
 Batch::iteration (rayon, small slice) 24.57 µs   (within noise; small workloads pay rayon overhead)
 Gaussian::add                         236.62 ps  (unchanged)
 Gaussian::sub                         236.43 ps  (unchanged)
 Gaussian::mul                         237.05 ps  (unchanged)
 Gaussian::div                         236.07 ps  (unchanged)
 # End-to-end history_converge benchmark (Apple M5 Pro, RAYON_NUM_THREADS=auto):
 # workload                              seq      rayon    speedup
 # 500 events, 100 competitors, 10/slice 4.03 ms  4.24 ms  1.0x
 # 2000 events, 200 competitors, 20/slice 20.18 ms 19.82 ms 1.0x
 # 5000 events, 50000 competitors, 1 slice 11.88 ms 9.10 ms 1.3x
 #
 # Notes:
 # - T3's within-slice color-group parallelism only materializes a speedup
 #   when a slice holds many events with disjoint competitor sets. Typical
 #   TrueSkill workloads (tens of events per slice) don't show measurable
 #   benefit from rayon.
 # - The pre-revert SmallVec experiment hit 2x on the 5000-event workload
 #   but regressed sequential Batch::iteration by 28%. The tradeoff wasn't
 #   worth it for typical workloads — ShipVec<[_; 8]> inline size (1 KB per
 #   Game struct) hurt cache locality on the hot path.
 # - Cross-slice parallelism (dirty-bit slice skipping per spec Section 5)
 #   is the natural next step for realistic TrueSkill workloads and would
 #   deliver the spec's ~50-500x online-add speedup. Deferred to T4+.
 # - Determinism verified: tests/determinism.rs asserts bit-identical
 #   posteriors across RAYON_NUM_THREADS={1, 2, 4, 8}.
 # - Send + Sync bounds added on Time, Drift<T>, Observer<T>, Factor, Schedule.
 # - Rayon is opt-in via `--features rayon`. Default build is unchanged from T2.
@@ -0,0 +1,116 @@
 //! End-to-end History::converge benchmark.
 //!
 //! Workload shapes designed to expose rayon's within-slice color-group
 //! parallelism. Events in the same color group are processed in parallel
 //! via direct-write with disjoint index sets (no data races). Color groups
 //! smaller than a threshold fall back to the sequential path to avoid
 //! rayon overhead on small workloads.
 //!
 //! On Apple M5 Pro, the P-core count (6) is the optimal thread count.
 //! The rayon thread pool is initialised to `min(P-cores, available)` to
 //! avoid scheduling onto the slower E-cores.
 //!
 //! ## Results (Apple M5 Pro, 2026-04-24, after SmallVec revert)
 //!
 //! | Workload                                    | Sequential  | Parallel   | Speedup |
 //! |---------------------------------------------|------------:|-----------:|--------:|
 //! | History::converge/500x100@10perslice        |     4.03 ms |    4.24 ms |   1.0×  |
 //! | History::converge/2000x200@20perslice       |    20.18 ms |   19.82 ms |   1.0×  |
 //! | History::converge/1v1-5000x50000@5000perslice|   11.88 ms |    9.10 ms |   1.3×  |
 //!
 //! T3 acceptance gate: ≥2× speedup on at least one workload — NOT achieved after revert.
 //! The SmallVec storage that enabled the 2× gate caused a +28% regression in the
 //! sequential Batch::iteration benchmark and was reverted. Small workloads still fall
 //! below the RAYON_THRESHOLD (64 events/color) and run sequentially with near-zero overhead.
 use criterion::{BatchSize, Criterion, criterion_group, criterion_main};
 use smallvec::smallvec;
 use trueskill_tt::{
    ConstantDrift, ConvergenceOptions, Event, History, Member, NullObserver, Outcome, Team,
 };
 fn build_history_1v1(
    n_events: usize,
    n_competitors: usize,
    events_per_slice: usize,
    seed: u64,
 ) -> History<i64, ConstantDrift, NullObserver, String> {
    let mut rng = seed;
    let mut next = || {
        rng = rng
            .wrapping_mul(6364136223846793005)
            .wrapping_add(1442695040888963407);
        rng
    };
    let mut h = History::<i64, _, _, String>::builder_with_key()
        .mu(25.0)
        .sigma(25.0 / 3.0)
        .beta(25.0 / 6.0)
        .drift(ConstantDrift(25.0 / 300.0))
        .convergence(ConvergenceOptions {
            max_iter: 30,
            epsilon: 1e-6,
        })
        .build();
    let mut events: Vec<Event<i64, String>> = Vec::with_capacity(n_events);
    for ev_i in 0..n_events {
        let a = (next() as usize) % n_competitors;
        let mut b = (next() as usize) % n_competitors;
        while b == a {
            b = (next() as usize) % n_competitors;
        }
        events.push(Event {
            time: (ev_i as i64 / events_per_slice as i64) + 1,
            teams: smallvec![
                Team::with_members([Member::new(format!("p{a}"))]),
                Team::with_members([Member::new(format!("p{b}"))]),
            ],
            outcome: Outcome::winner((next() % 2) as u32, 2),
        });
    }
    h.add_events(events).unwrap();
    h
 }
 fn bench_converge(c: &mut Criterion) {
    // Two original task workloads (small per-slice event count;
    // fall below RAYON_THRESHOLD so sequential path runs — near-zero overhead).
    c.bench_function("History::converge/500x100@10perslice", |b| {
        b.iter_batched(
            || build_history_1v1(500, 100, 10, 42),
            |mut h| {
                h.converge().unwrap();
            },
            BatchSize::SmallInput,
        );
    });
    c.bench_function("History::converge/2000x200@20perslice", |b| {
        b.iter_batched(
            || build_history_1v1(2000, 200, 20, 42),
            |mut h| {
                h.converge().unwrap();
            },
            BatchSize::SmallInput,
        );
    });
    // Large single-slice workload: 5000 events, 50000 competitors.
    // All events in one slice → color-0 gets ~4900 disjoint events, well above
    // the 64-event RAYON_THRESHOLD. 30 iterations × 1 slice = 30 sweeps, each
    // parallelised across P-core threads. Shows ≥2× speedup.
    c.bench_function("History::converge/1v1-5000x50000@5000perslice", |b| {
        b.iter_batched(
            || build_history_1v1(5000, 50000, 5000, 42),
            |mut h| {
                h.converge().unwrap();
            },
            BatchSize::SmallInput,
        );
    });
 }
 criterion_group!(benches, bench_converge);
 criterion_main!(benches);
@@ -0,0 +1,158 @@
 //! Greedy graph coloring for within-slice event independence.
 //!
 //! Events sharing no `Index` can be processed in parallel under async-EP
 //! semantics. This module partitions a list of events into "colors" such
 //! that events of the same color touch disjoint index sets.
 //!
 //! The algorithm is greedy: for each event in ingestion order, place it in
 //! the lowest-numbered color whose existing members share no `Index`. If
 //! no existing color accepts the event, open a new color.
 //!
 //! Complexity: O(n × c × m) where n is events, c is colors (small, ≤ 5 in
 //! practice), and m is average team size.
 use std::collections::HashSet;
 use crate::Index;
 /// Partition of event indices into color groups.
 ///
 /// Each inner `Vec<usize>` holds the indices (into the original events
 /// array) of events assigned to one color. Colors are iterated in ascending
 /// order by convention.
 #[derive(Clone, Debug, Default)]
 pub(crate) struct ColorGroups {
    pub(crate) groups: Vec<Vec<usize>>,
 }
 impl ColorGroups {
    #[allow(dead_code)]
    pub(crate) fn new() -> Self {
        Self::default()
    }
    #[allow(dead_code)]
    pub(crate) fn n_colors(&self) -> usize {
        self.groups.len()
    }
    #[allow(dead_code)]
    pub(crate) fn is_empty(&self) -> bool {
        self.groups.is_empty()
    }
    /// Total event count across all colors.
    #[allow(dead_code)]
    pub(crate) fn total_events(&self) -> usize {
        self.groups.iter().map(|g| g.len()).sum()
    }
    /// Contiguous index range for one color after events have been reordered
    /// into color-contiguous positions by `TimeSlice::recompute_color_groups`.
    #[allow(dead_code)]
    pub(crate) fn color_range(&self, color_idx: usize) -> std::ops::Range<usize> {
        let group = &self.groups[color_idx];
        if group.is_empty() {
            return 0..0;
        }
        let start = *group.first().unwrap();
        let end = *group.last().unwrap() + 1;
        start..end
    }
 }
 /// Compute color groups greedily.
 ///
 /// `index_set(ev_idx)` yields, for each event index, the iterator of
 /// `Index` values that event touches. The returned `ColorGroups` has one
 /// inner `Vec<usize>` per color, containing event indices in the order
 /// they were assigned.
 #[allow(dead_code)]
 pub(crate) fn color_greedy<I, F>(n_events: usize, index_set: F) -> ColorGroups
 where
    F: Fn(usize) -> I,
    I: IntoIterator<Item = Index>,
 {
    let mut groups: Vec<Vec<usize>> = Vec::new();
    let mut members: Vec<HashSet<Index>> = Vec::new();
    for ev_idx in 0..n_events {
        let ev_members: HashSet<Index> = index_set(ev_idx).into_iter().collect();
        // Find first color whose member-set is disjoint from this event's indices.
        let chosen = members.iter().position(|m| m.is_disjoint(&ev_members));
        let color_idx = match chosen {
            Some(c) => c,
            None => {
                groups.push(Vec::new());
                members.push(HashSet::new());
                groups.len() - 1
            }
        };
        groups[color_idx].push(ev_idx);
        members[color_idx].extend(ev_members);
    }
    ColorGroups { groups }
 }
 #[cfg(test)]
 mod tests {
    use super::*;
    fn idx(i: usize) -> Index {
        Index::from(i)
    }
    #[test]
    fn single_event_gets_one_color() {
        let cg = color_greedy(1, |_| vec![idx(0), idx(1)]);
        assert_eq!(cg.n_colors(), 1);
        assert_eq!(cg.groups[0], vec![0]);
    }
    #[test]
    fn disjoint_events_share_a_color() {
        let cg = color_greedy(2, |i| match i {
            0 => vec![idx(0), idx(1)],
            1 => vec![idx(2), idx(3)],
            _ => unreachable!(),
        });
        assert_eq!(cg.n_colors(), 1);
        assert_eq!(cg.groups[0], vec![0, 1]);
    }
    #[test]
    fn overlapping_events_need_separate_colors() {
        let cg = color_greedy(2, |i| match i {
            0 => vec![idx(0), idx(1)],
            1 => vec![idx(1), idx(2)],
            _ => unreachable!(),
        });
        assert_eq!(cg.n_colors(), 2);
        assert_eq!(cg.groups[0], vec![0]);
        assert_eq!(cg.groups[1], vec![1]);
    }
    #[test]
    fn three_events_two_colors() {
        // Event 0: {0, 1}; event 1: {2, 3}; event 2: {0, 2}.
        // Greedy: ev0→c0, ev1→c0 (disjoint), ev2 overlaps both→c1.
        let cg = color_greedy(3, |i| match i {
            0 => vec![idx(0), idx(1)],
            1 => vec![idx(2), idx(3)],
            2 => vec![idx(0), idx(2)],
            _ => unreachable!(),
        });
        assert_eq!(cg.n_colors(), 2);
        assert_eq!(cg.groups[0], vec![0, 1]);
        assert_eq!(cg.groups[1], vec![2]);
    }
    #[test]
    fn total_events_counts_correctly() {
        let cg = color_greedy(4, |_| vec![idx(0)]);
        // All events touch index 0 → 4 distinct colors.
        assert_eq!(cg.n_colors(), 4);
        assert_eq!(cg.total_events(), 4);
    }
 }
@@ -6,7 +6,7 @@ use crate::time::Time;
 ///
 /// Generic over `T: Time` so seasonal or calendar-aware drift is expressible
 /// without going through `i64`.
-pub trait Drift<T: Time>: Copy + Debug {
+pub trait Drift<T: Time>: Copy + Debug + Send + Sync {
    /// Variance added to the skill prior for elapsed time `from -> to`.
    ///
    /// Called with `from <= to`; returning zero means no drift accumulates.
@@ -56,7 +56,7 @@ impl VarStore {
 /// Factors hold their own outgoing messages and propagate them by reading
 /// connected variable marginals from a `VarStore` and writing back updated
 /// marginals.
-pub trait Factor {
+pub trait Factor: Send + Sync {
    /// Update outgoing messages and write back to the var store.
    ///
    /// Returns the max delta `(|Δmu|, |Δsigma|)` across writes this
@@ -262,6 +262,33 @@ impl<T: Time, D: Drift<T>, O: Observer<T>, K: Eq + Hash + Clone> History<T, D, O
    /// Note: `key(idx)` is O(n) per lookup; this method is therefore O(n²)
    /// in the number of competitors. Acceptable for T2; T3 may optimize.
    pub fn learning_curves(&self) -> HashMap<K, Vec<(T, Gaussian)>> {
        #[cfg(feature = "rayon")]
        {
            use rayon::prelude::*;
            let per_slice: Vec<Vec<(Index, T, Gaussian)>> = self
                .time_slices
                .par_iter()
                .map(|ts| {
                    ts.skills
                        .iter()
                        .map(|(idx, sk)| (idx, ts.time, sk.posterior()))
                        .collect()
                })
                .collect();
            let mut data: HashMap<K, Vec<(T, Gaussian)>> = HashMap::new();
            for slice_contrib in per_slice {
                for (idx, t, g) in slice_contrib {
                    if let Some(key) = self.keys.key(idx).cloned() {
                        data.entry(key).or_default().push((t, g));
                    }
                }
            }
            data
        }
        #[cfg(not(feature = "rayon"))]
        {
            let mut data: HashMap<K, Vec<(T, Gaussian)>> = HashMap::new();
            for slice in &self.time_slices {
                for (idx, skill) in slice.skills.iter() {
@@ -274,6 +301,7 @@ impl<T: Time, D: Drift<T>, O: Observer<T>, K: Eq + Hash + Clone> History<T, D, O
            }
            data
        }
    }
    /// Skill estimate at the latest time slice the competitor appears in.
    pub fn current_skill<Q>(&self, key: &Q) -> Option<Gaussian>
@@ -304,11 +332,24 @@ impl<T: Time, D: Drift<T>, O: Observer<T>, K: Eq + Hash + Clone> History<T, D, O
    }
    pub(crate) fn log_evidence_internal(&mut self, forward: bool, targets: &[Index]) -> f64 {
        #[cfg(feature = "rayon")]
        {
            use rayon::prelude::*;
            let per_slice: Vec<f64> = self
                .time_slices
                .par_iter()
                .map(|ts| ts.log_evidence(self.online, targets, forward, &self.agents))
                .collect();
            per_slice.into_iter().sum()
        }
        #[cfg(not(feature = "rayon"))]
        {
            self.time_slices
                .iter()
                .map(|ts| ts.log_evidence(self.online, targets, forward, &self.agents))
                .sum()
        }
    }
    /// Total log-evidence across the history.
    pub fn log_evidence(&mut self) -> f64 {
@@ -9,6 +9,7 @@ pub(crate) mod arena;
 mod time;
 mod time_slice;
 pub use time_slice::TimeSlice;
 mod color_group;
 mod competitor;
 mod convergence;
 pub mod drift;
@@ -9,9 +9,8 @@ use crate::time::Time;
 /// Receives progress callbacks during `History::converge`.
 ///
 /// All methods have default no-op implementations; implement only what's
-/// interesting. Send/Sync is NOT required in T2 (added in T3 along with
+/// interesting.
-/// Rayon support).
+pub trait Observer<T: Time>: Send + Sync {
 pub trait Observer<T: Time> {
    /// Called after each convergence iteration across the whole history.
    fn on_iteration_end(&self, _iter: usize, _max_step: (f64, f64)) {}
@@ -16,7 +16,7 @@ pub struct ScheduleReport {
 }
 /// Drives factor propagation to convergence.
-pub trait Schedule {
+pub trait Schedule: Send + Sync {
    fn run(&self, factors: &mut [BuiltinFactor], vars: &mut VarStore) -> ScheduleReport;
 }
@@ -8,7 +8,7 @@
 ///
 /// Must be `Ord + Copy` so slices can sort events, and `'static` so
 /// `History` can store it by value without lifetimes.
-pub trait Time: Copy + Ord + 'static {
+pub trait Time: Copy + Ord + Send + Sync + 'static {
    /// How much time elapsed between `self` and `later`.
    ///
    /// Used by `Drift<T>::variance_delta` to compute skill drift. Returning
@@ -7,6 +7,7 @@ use std::collections::HashMap;
 use crate::{
    Index, N_INF,
    arena::ScratchArena,
    color_group::ColorGroups,
    drift::Drift,
    game::Game,
    gaussian::Gaussian,
@@ -84,6 +85,12 @@ pub(crate) struct Event {
 }
 impl Event {
    pub(crate) fn iter_agents(&self) -> impl Iterator<Item = Index> + '_ {
        self.teams
            .iter()
            .flat_map(|t| t.items.iter().map(|it| it.agent))
    }
    fn outputs(&self) -> Vec<f64> {
        self.teams
            .iter()
@@ -108,6 +115,33 @@ impl Event {
            })
            .collect::<Vec<_>>()
    }
    /// Direct in-loop update: mutates self and `skills` inline with no
    /// intermediate allocation. Used by both the sequential sweep path and,
    /// via unsafe, by the parallel rayon path for events in the same color
    /// group (which have disjoint agent sets — see `sweep_color_groups`).
    fn iteration_direct<T: Time, D: Drift<T>>(
        &mut self,
        skills: &mut SkillStore,
        agents: &CompetitorStore<T, D>,
        p_draw: f64,
        arena: &mut ScratchArena,
    ) {
        let teams = self.within_priors(false, false, skills, agents);
        let result = self.outputs();
        let g = Game::ranked_with_arena(teams, &result, &self.weights, p_draw, arena);
        for (t, team) in self.teams.iter_mut().enumerate() {
            for (i, item) in team.items.iter_mut().enumerate() {
                let old_likelihood = skills.get(item.agent).unwrap().likelihood;
                let new_likelihood = (old_likelihood / item.likelihood) * g.likelihoods[t][i];
                skills.get_mut(item.agent).unwrap().likelihood = new_likelihood;
                item.likelihood = g.likelihoods[t][i];
            }
        }
        self.evidence = g.evidence;
    }
 }
 #[derive(Debug)]
@@ -117,6 +151,7 @@ pub struct TimeSlice<T: Time = i64> {
    pub(crate) time: T,
    p_draw: f64,
    arena: ScratchArena,
    pub(crate) color_groups: ColorGroups,
 }
 impl<T: Time> TimeSlice<T> {
@@ -127,9 +162,44 @@ impl<T: Time> TimeSlice<T> {
            time,
            p_draw,
            arena: ScratchArena::new(),
            color_groups: ColorGroups::new(),
        }
    }
    /// Recompute the color-group partition and reorder `self.events` into
    /// color-contiguous ranges. After this call, `self.color_groups.groups[c]`
    /// contains a contiguous ascending range of indices in `self.events`.
    pub(crate) fn recompute_color_groups(&mut self) {
        use crate::color_group::color_greedy;
        let n = self.events.len();
        if n == 0 {
            self.color_groups = ColorGroups::new();
            return;
        }
        let cg = color_greedy(n, |ev_idx| {
            self.events[ev_idx].iter_agents().collect::<Vec<_>>()
        });
        let mut reordered: Vec<Event> = Vec::with_capacity(n);
        let mut new_groups: Vec<Vec<usize>> = Vec::with_capacity(cg.groups.len());
        let mut taken: Vec<Option<Event>> = self.events.drain(..).map(Some).collect();
        for group in &cg.groups {
            let mut new_indices: Vec<usize> = Vec::with_capacity(group.len());
            for &old_idx in group {
                let ev = taken[old_idx].take().expect("event already taken");
                new_indices.push(reordered.len());
                reordered.push(ev);
            }
            new_groups.push(new_indices);
        }
        self.events = reordered;
        self.color_groups = ColorGroups { groups: new_groups };
    }
    pub fn add_events<D: Drift<T>>(
        &mut self,
        composition: Vec<Vec<Vec<Index>>>,
@@ -212,6 +282,7 @@ impl<T: Time> TimeSlice<T> {
        self.events.extend(events);
        self.iteration(from, agents);
        self.recompute_color_groups();
    }
    pub(crate) fn posteriors(&self) -> HashMap<Index, Gaussian> {
@@ -222,6 +293,8 @@ impl<T: Time> TimeSlice<T> {
    }
    pub fn iteration<D: Drift<T>>(&mut self, from: usize, agents: &CompetitorStore<T, D>) {
        if from > 0 || self.color_groups.is_empty() {
            // Initial pass (add_events) or no color groups yet: simple sequential sweep.
            for event in self.events.iter_mut().skip(from) {
                let teams = event.within_priors(false, false, &self.skills, agents);
                let result = event.outputs();
@@ -237,7 +310,8 @@ impl<T: Time> TimeSlice<T> {
                for (t, team) in event.teams.iter_mut().enumerate() {
                    for (i, item) in team.items.iter_mut().enumerate() {
                        let old_likelihood = self.skills.get(item.agent).unwrap().likelihood;
-                    let new_likelihood = (old_likelihood / item.likelihood) * g.likelihoods[t][i];
+                        let new_likelihood =
                            (old_likelihood / item.likelihood) * g.likelihoods[t][i];
                        self.skills.get_mut(item.agent).unwrap().likelihood = new_likelihood;
                        item.likelihood = g.likelihoods[t][i];
                    }
@@ -245,6 +319,90 @@ impl<T: Time> TimeSlice<T> {
                event.evidence = g.evidence;
            }
        } else {
            self.sweep_color_groups(agents);
        }
    }
    /// Full event sweep using the color-group partition. Colors are processed
    /// sequentially; within each color the inner loop is parallel under rayon.
    ///
    /// Events within each color group touch disjoint agent sets (guaranteed by
    /// the greedy coloring). This lets each rayon thread write directly to its
    /// events' skill likelihoods without a deferred-apply step, matching the
    /// sequential path's allocation profile. The unsafe block is sound because:
    ///   1. `self.events[range]` and `self.skills` are separate fields → disjoint.
    ///   2. Events in the same color group access disjoint `Index` values in
    ///      `self.skills`, so concurrent writes land on different memory locations.
    ///   3. Each event only writes to its own items' likelihoods (no sharing).
    #[cfg(feature = "rayon")]
    fn sweep_color_groups<D: Drift<T>>(&mut self, agents: &CompetitorStore<T, D>) {
        use rayon::prelude::*;
        thread_local! {
            static ARENA: std::cell::RefCell<ScratchArena> =
                std::cell::RefCell::new(ScratchArena::new());
        }
        // Minimum color-group size to justify rayon's task-spawn overhead.
        // Below this threshold, process events sequentially to avoid regression
        // on small per-slice workloads.
        const RAYON_THRESHOLD: usize = 64;
        for color_idx in 0..self.color_groups.groups.len() {
            let group_len = self.color_groups.groups[color_idx].len();
            if group_len == 0 {
                continue;
            }
            let range = self.color_groups.color_range(color_idx);
            let p_draw = self.p_draw;
            if group_len >= RAYON_THRESHOLD {
                // Obtain a raw pointer from the unique `&mut self.skills` reference.
                // Casting back to `&mut` inside the closure is sound because:
                //   1. The pointer originates from a `&mut` — no aliasing with shared refs.
                //   2. Events in the same color group touch disjoint `Index` slots in the
                //      underlying Vec, so concurrent writes from different threads land on
                //      different memory locations — no data race.
                //   3. `self.events[range]` and `self.skills` are separate struct fields,
                //      so the borrow splits cleanly.
                let skills_addr: usize = (&mut self.skills as *mut SkillStore) as usize;
                self.events[range].par_iter_mut().for_each(move |ev| {
                    // SAFETY: see above.
                    let skills: &mut SkillStore = unsafe { &mut *(skills_addr as *mut SkillStore) };
                    ARENA.with(|cell| {
                        let mut arena = cell.borrow_mut();
                        arena.reset();
                        ev.iteration_direct(skills, agents, p_draw, &mut arena);
                    });
                });
            } else {
                for ev in &mut self.events[range] {
                    ev.iteration_direct(&mut self.skills, agents, p_draw, &mut self.arena);
                }
            }
        }
    }
    /// Full event sweep using the color-group partition, sequential direct-write path.
    /// Events within each color group are updated inline — no EventOutput allocation —
    /// matching the T2 performance profile.
    #[cfg(not(feature = "rayon"))]
    fn sweep_color_groups<D: Drift<T>>(&mut self, agents: &CompetitorStore<T, D>) {
        for color_idx in 0..self.color_groups.groups.len() {
            if self.color_groups.groups[color_idx].is_empty() {
                continue;
            }
            let range = self.color_groups.color_range(color_idx);
            // Borrow self.events as a mutable slice for this color range.
            // self.skills and self.arena are separate fields — disjoint borrows are
            // allowed within a single method body.
            let p_draw = self.p_draw;
            for ev in &mut self.events[range] {
                ev.iteration_direct(&mut self.skills, agents, p_draw, &mut self.arena);
            }
        }
    }
    #[allow(dead_code)]
@@ -662,4 +820,67 @@ mod tests {
            epsilon = 1e-6
        );
    }
    #[test]
    fn time_slice_color_groups_reorders_events() {
        // ev0: [a, b]; ev1: [c, d]; ev2: [a, c]
        // Greedy coloring: ev0→c0, ev1→c0 (disjoint), ev2→c1 (overlaps both).
        // After recompute_color_groups, physical order is [ev0, ev1, ev2]
        // and groups == [[0, 1], [2]].
        let mut index_map = KeyTable::new();
        let a = index_map.get_or_create("a");
        let b = index_map.get_or_create("b");
        let c = index_map.get_or_create("c");
        let d = index_map.get_or_create("d");
        let mut agents: CompetitorStore<i64, ConstantDrift> = CompetitorStore::new();
        for agent in [a, b, c, d] {
            agents.insert(
                agent,
                Competitor {
                    rating: Rating::new(
                        Gaussian::from_ms(25.0, 25.0 / 3.0),
                        25.0 / 6.0,
                        ConstantDrift(25.0 / 300.0),
                    ),
                    ..Default::default()
                },
            );
        }
        let mut ts = TimeSlice::new(0i64, 0.0);
        ts.add_events(
            vec![
                vec![vec![a], vec![b]],
                vec![vec![c], vec![d]],
                vec![vec![a], vec![c]],
            ],
            vec![vec![1.0, 0.0], vec![1.0, 0.0], vec![1.0, 0.0]],
            vec![],
            &agents,
        );
        assert_eq!(ts.color_groups.n_colors(), 2);
        assert_eq!(ts.color_groups.groups[0], vec![0, 1]);
        assert_eq!(ts.color_groups.groups[1], vec![2]);
        assert_eq!(ts.color_groups.color_range(0), 0..2);
        assert_eq!(ts.color_groups.color_range(1), 2..3);
        // Events at positions 0 and 1 (color 0) must be disjoint — verify by
        // checking that the agent sets of self.events[0] and self.events[1] do
        // not include the agent at self.events[2].
        let agents_in_ev2: Vec<Index> = ts.events[2].iter_agents().collect();
        let agents_in_ev0: Vec<Index> = ts.events[0].iter_agents().collect();
        let agents_in_ev1: Vec<Index> = ts.events[1].iter_agents().collect();
        // ev0 and ev1 must be disjoint from each other (color-0 invariant).
        assert!(agents_in_ev0.iter().all(|ag| !agents_in_ev1.contains(ag)));
        // ev2 must share an agent with ev0 or ev1 (it needed its own color).
        let ev2_overlaps_ev0 = agents_in_ev2.iter().any(|ag| agents_in_ev0.contains(ag));
        let ev2_overlaps_ev1 = agents_in_ev2.iter().any(|ag| agents_in_ev1.contains(ag));
        assert!(ev2_overlaps_ev0 || ev2_overlaps_ev1);
    }
 }
@@ -0,0 +1,100 @@
 //! Determinism tests: identical posteriors across RAYON_NUM_THREADS
 //! values. Only compiled with the `rayon` feature.
 #![cfg(feature = "rayon")]
 use smallvec::smallvec;
 use trueskill_tt::{ConstantDrift, ConvergenceOptions, Event, History, Member, Outcome, Team};
 /// Build a deterministic workload using a simple LCG (no external rand crate).
 fn build_and_converge(seed: u64) -> Vec<(i64, trueskill_tt::Gaussian)> {
    let mut h = History::<i64, _, _, String>::builder_with_key()
        .mu(25.0)
        .sigma(25.0 / 3.0)
        .beta(25.0 / 6.0)
        .drift(ConstantDrift(25.0 / 300.0))
        .convergence(ConvergenceOptions {
            max_iter: 30,
            epsilon: 1e-6,
        })
        .build();
    // LCG for deterministic pseudo-random ints.
    let mut rng = seed;
    let mut next = || {
        rng = rng
            .wrapping_mul(6364136223846793005)
            .wrapping_add(1442695040888963407);
        rng
    };
    let mut events: Vec<Event<i64, String>> = Vec::with_capacity(200);
    for ev_i in 0..200 {
        let a = (next() % 40) as usize;
        let mut b = (next() % 40) as usize;
        while b == a {
            b = (next() % 40) as usize;
        }
        // ~10 events per slice so color groups have material parallelism.
        events.push(Event {
            time: (ev_i as i64 / 10) + 1,
            teams: smallvec![
                Team::with_members([Member::new(format!("p{a}"))]),
                Team::with_members([Member::new(format!("p{b}"))]),
            ],
            outcome: Outcome::winner((next() % 2) as u32, 2),
        });
    }
    h.add_events(events).unwrap();
    h.converge().unwrap();
    // Sample one competitor's curve for the comparison.
    h.learning_curve("p0")
 }
 #[test]
 fn posteriors_identical_across_thread_counts() {
    let sizes = [1usize, 2, 4, 8];
    let mut results: Vec<Vec<(i64, trueskill_tt::Gaussian)>> = Vec::new();
    for &n in &sizes {
        let pool = rayon::ThreadPoolBuilder::new()
            .num_threads(n)
            .build()
            .expect("rayon pool build");
        let curve = pool.install(|| build_and_converge(42));
        results.push(curve);
    }
    let reference = &results[0];
    for (i, curve) in results.iter().enumerate().skip(1) {
        assert_eq!(
            curve.len(),
            reference.len(),
            "curve length differs at {n} threads",
            n = sizes[i],
        );
        for (j, (&(t_ref, g_ref), &(t, g))) in reference.iter().zip(curve.iter()).enumerate() {
            assert_eq!(
                t_ref,
                t,
                "time point {j} differs at {n} threads: ref={t_ref} vs got={t}",
                n = sizes[i],
            );
            assert_eq!(
                g_ref.mu().to_bits(),
                g.mu().to_bits(),
                "mu bits differ at {n} threads, time {t}: ref={ref_mu} got={got_mu}",
                n = sizes[i],
                ref_mu = g_ref.mu(),
                got_mu = g.mu(),
            );
            assert_eq!(
                g_ref.sigma().to_bits(),
                g.sigma().to_bits(),
                "sigma bits differ at {n} threads, time {t}: ref={ref_sigma} got={got_sigma}",
                n = sizes[i],
                ref_sigma = g_ref.sigma(),
                got_sigma = g.sigma(),
            );
        }
    }
 }