A lot of progress.

2020-02-20 09:19:11 +01:00
parent fd249da405
commit 7528b3b67b
18 changed files with 2191 additions and 110 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -5,4 +5,13 @@ authors = ["Anders Olsson <anders.e.olsson@gmail.com>"]
 edition = "2018"

 [dependencies]
+expm = "0.1"
 ndarray = "0.13"
+ndarray-linalg = { version = "0.12" }
+cblas = "0.2"
+lapacke = "0.2"
+statrs = "0.10"
+ordered-float = "1.0"
+rand = "0.6"
+rand_xoshiro = "0.1"
+openblas-src = { version = "0.8", features = ["static"] }
--- a/build.sh
+++ b/build.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+export LIBRARY_PATH="$LIBRARY_PATH:/usr/local/gfortran/lib"
+
+cargo run --example basic
--- a/examples/basic.rs
+++ b/examples/basic.rs
@@ -1,3 +1,5 @@
+extern crate openblas_src;
+
 use kickscore as ks;

 fn main() {
--- a/src/condest.rs
+++ b/src/condest.rs
@@ -0,0 +1,973 @@
+//! This crate implements the matrix 1-norm estimator by [Higham and Tisseur].
+//!
+//! [Higham and Tisseur]: http://eprints.ma.man.ac.uk/321/1/covered/MIMS_ep2006_145.pdf
+use std::cmp;
+use std::collections::BTreeSet;
+use std::slice;
+
+use ndarray::{prelude::*, s, ArrayBase, Data, DataMut, Dimension, Ix1, Ix2};
+use ordered_float::NotNan;
+use rand::{thread_rng, Rng, SeedableRng};
+use rand_xoshiro::Xoshiro256StarStar;
+
+pub struct Normest1 {
+    n: usize,
+    t: usize,
+    rng: Xoshiro256StarStar,
+    x_matrix: Array2<f64>,
+    y_matrix: Array2<f64>,
+    z_matrix: Array2<f64>,
+    w_vector: Array1<f64>,
+    sign_matrix: Array2<f64>,
+    sign_matrix_old: Array2<f64>,
+    column_is_parallel: Vec<bool>,
+    indices: Vec<usize>,
+    indices_history: BTreeSet<usize>,
+    h: Vec<NotNan<f64>>,
+}
+
+/// A trait to generalize over 1-norm estimates of a matrix `A`, matrix powers `A^m`,
+/// or matrix products `A1 * A2 * ... * An`.
+///
+/// In the 1-norm estimator, one repeatedly constructs a matrix-matrix product between some n×n
+/// matrix X and some other n×t matrix Y. If one wanted to estimate the 1-norm of a matrix m times
+/// itself, X^m, it might thus be computationally less expensive to repeatedly apply
+/// X * ( * ( X ... ( X * Y ) rather than to calculate Z = X^m = X * X * ... * X and then apply Z *
+/// Y. In the first case, one has several matrix-matrix multiplications with complexity O(m*n*n*t),
+/// while in the latter case one has O(m*n*n*n) (plus one more O(n*n*t)).
+///
+/// So in case of t << n, it is cheaper to repeatedly apply matrix multiplication to the smaller
+/// matrix on the RHS, rather than to construct one definite matrix on the LHS.  Of course, this is
+/// modified by the number of iterations needed when performing the norm estimate, sustained
+/// performance of the matrix multiplication method used, etc.
+///
+/// It is at the designation of the user to check what is more efficient: to pass in one definite
+/// matrix or choose the alternative route described here.
+trait LinearOperator {
+    fn multiply_matrix<S>(
+        &self,
+        b: &mut ArrayBase<S, Ix2>,
+        c: &mut ArrayBase<S, Ix2>,
+        transpose: bool,
+    ) where
+        S: DataMut<Elem = f64>;
+}
+
+impl<S1> LinearOperator for ArrayBase<S1, Ix2>
+where
+    S1: Data<Elem = f64>,
+{
+    fn multiply_matrix<S2>(
+        &self,
+        b: &mut ArrayBase<S2, Ix2>,
+        c: &mut ArrayBase<S2, Ix2>,
+        transpose: bool,
+    ) where
+        S2: DataMut<Elem = f64>,
+    {
+        let (n_rows, n_cols) = self.dim();
+        assert_eq!(
+            n_rows, n_cols,
+            "Number of rows and columns does not match: `self` has to be a square matrix"
+        );
+        let n = n_rows;
+
+        let (b_n, b_t) = b.dim();
+        let (c_n, c_t) = b.dim();
+
+        assert_eq!(
+            n, b_n,
+            "Number of rows of b not equal to number of rows of `self`."
+        );
+        assert_eq!(
+            n, c_n,
+            "Number of rows of c not equal to number of rows of `self`."
+        );
+
+        assert_eq!(
+            b_t, c_t,
+            "Number of columns of b not equal to number of columns of c."
+        );
+
+        let t = b_t;
+
+        let (a_slice, a_layout) =
+            as_slice_with_layout(self).expect("Matrix `self` not contiguous.");
+        let (b_slice, b_layout) = as_slice_with_layout(b).expect("Matrix `b` not contiguous.");
+        let (c_slice, c_layout) = as_slice_with_layout_mut(c).expect("Matrix `c` not contiguous.");
+
+        assert_eq!(a_layout, b_layout);
+        assert_eq!(a_layout, c_layout);
+
+        let layout = a_layout;
+
+        let a_transpose = if transpose {
+            cblas::Transpose::Ordinary
+        } else {
+            cblas::Transpose::None
+        };
+
+        unsafe {
+            cblas::dgemm(
+                layout,
+                a_transpose,
+                cblas::Transpose::None,
+                n as i32,
+                t as i32,
+                n as i32,
+                1.0,
+                a_slice,
+                n as i32,
+                b_slice,
+                t as i32,
+                0.0,
+                c_slice,
+                t as i32,
+            )
+        }
+    }
+}
+
+impl<S1> LinearOperator for [&ArrayBase<S1, Ix2>]
+where
+    S1: Data<Elem = f64>,
+{
+    fn multiply_matrix<S2>(
+        &self,
+        b: &mut ArrayBase<S2, Ix2>,
+        c: &mut ArrayBase<S2, Ix2>,
+        transpose: bool,
+    ) where
+        S2: DataMut<Elem = f64>,
+    {
+        if self.len() > 0 {
+            let mut reversed;
+            let mut forward;
+
+            // TODO: Investigate, if an enum instead of a trait object might be more performant.
+            // This probably doesn't matter for large matrices, but could have a measurable impact
+            // on small ones.
+            let a_iter: &mut dyn DoubleEndedIterator<Item = _> = if transpose {
+                reversed = self.iter().rev();
+                &mut reversed
+            } else {
+                forward = self.iter();
+                &mut forward
+            };
+            let a = a_iter.next().unwrap(); // Ok because of if condition
+            a.multiply_matrix(b, c, transpose);
+
+            // NOTE: The swap in the loop body makes use of the fact that in all instances where
+            // `multiply_matrix` is used, the values potentially stored in `b` are not required
+            // anymore.
+            for a in a_iter {
+                std::mem::swap(b, c);
+                a.multiply_matrix(b, c, transpose);
+            }
+        }
+    }
+}
+
+impl<S1> LinearOperator for (&ArrayBase<S1, Ix2>, usize)
+where
+    S1: Data<Elem = f64>,
+{
+    fn multiply_matrix<S2>(
+        &self,
+        b: &mut ArrayBase<S2, Ix2>,
+        c: &mut ArrayBase<S2, Ix2>,
+        transpose: bool,
+    ) where
+        S2: DataMut<Elem = f64>,
+    {
+        let a = self.0;
+        let m = self.1;
+        if m > 0 {
+            a.multiply_matrix(b, c, transpose);
+            for _ in 1..m {
+                std::mem::swap(b, c);
+                self.0.multiply_matrix(b, c, transpose);
+            }
+        }
+    }
+}
+
+impl Normest1 {
+    pub fn new(n: usize, t: usize) -> Self {
+        assert!(
+            t <= n,
+            "Cannot have more iteration columns t than columns in the matrix."
+        );
+        let rng =
+            Xoshiro256StarStar::from_rng(&mut thread_rng()).expect("Rng initialization failed.");
+        let x_matrix = unsafe { Array2::<f64>::uninitialized((n, t)) };
+        let y_matrix = unsafe { Array2::<f64>::uninitialized((n, t)) };
+        let z_matrix = unsafe { Array2::<f64>::uninitialized((n, t)) };
+
+        let w_vector = unsafe { Array1::uninitialized(n) };
+
+        let sign_matrix = unsafe { Array2::<f64>::uninitialized((n, t)) };
+        let sign_matrix_old = unsafe { Array2::<f64>::uninitialized((n, t)) };
+
+        let column_is_parallel = vec![false; t];
+
+        let indices = (0..n).collect();
+        let indices_history = BTreeSet::new();
+
+        let h = vec![unsafe { NotNan::unchecked_new(0.0) }; n];
+
+        Normest1 {
+            n,
+            t,
+            rng,
+            x_matrix,
+            y_matrix,
+            z_matrix,
+            w_vector,
+            sign_matrix,
+            sign_matrix_old,
+            column_is_parallel,
+            indices,
+            indices_history,
+            h,
+        }
+    }
+
+    fn calculate<L>(&mut self, a_linear_operator: &L, itmax: usize) -> f64
+    where
+        L: LinearOperator + ?Sized,
+    {
+        assert!(itmax > 1, "normest1 is undefined for iterations itmax < 2");
+
+        // Explicitly empty the index history; all other quantities will be overwritten at some
+        // point.
+        self.indices_history.clear();
+
+        let n = self.n;
+        let t = self.t;
+
+        let sample = [-1., 1.0];
+
+        // “We now explain our choice of starting matrix. We take the first column of X to be the
+        // vector of 1s, which is the starting vector used in Algorithm 2.1. This has the advantage
+        // that for a matrix with nonnegative elements the algorithm converges with an exact estimate
+        // on the second iteration, and such matrices arise in applications, for example as a
+        // stochastic matrix or as the inverse of an M -matrix.”
+        //
+        // “The remaining columns are chosen as rand {− 1 , 1 } , with a check for and correction of
+        // parallel columns, exactly as for S in the body of the algorithm. We choose random vectors
+        // because it is difficult to argue for any particular fixed vectors and because randomness
+        // lessens the importance of counterexamples (see the comments in the next section).”
+        {
+            let rng_mut = &mut self.rng;
+            self.x_matrix
+                .mapv_inplace(|_| sample[rng_mut.gen_range(0, sample.len())]);
+            self.x_matrix.column_mut(0).fill(1.);
+        }
+
+        // Resample the x_matrix to make sure no columns are parallel
+        find_parallel_columns_in(
+            &self.x_matrix,
+            &mut self.y_matrix,
+            &mut self.column_is_parallel,
+        );
+        for (i, is_parallel) in self.column_is_parallel.iter().enumerate() {
+            if *is_parallel {
+                resample_column(&mut self.x_matrix, i, &mut self.rng, &sample);
+            }
+        }
+
+        // Set all columns to unit vectors
+        self.x_matrix.mapv_inplace(|x| x / n as f64);
+
+        let mut estimate = 0.0;
+        let mut best_index = 0;
+
+        'optimization_loop: for k in 0..itmax {
+            // Y = A X
+            a_linear_operator.multiply_matrix(&mut self.x_matrix, &mut self.y_matrix, false);
+
+            // est = max{‖Y(:,j)‖₁ : j = 1:t}
+            let (max_norm_index, max_norm) = matrix_onenorm_with_index(&self.y_matrix);
+
+            // if est > est_old or k=2
+            if max_norm > estimate || k == 1 {
+                // ind_best = indⱼ where est = ‖Y(:,j)‖₁, w = Y(:, ind_best)
+                estimate = max_norm;
+                best_index = self.indices[max_norm_index];
+                self.w_vector.assign(&self.y_matrix.column(max_norm_index));
+            } else if k > 1 && max_norm <= estimate {
+                break 'optimization_loop;
+            }
+
+            if k >= itmax {
+                break 'optimization_loop;
+            }
+
+            // S = sign(Y)
+            assign_signum_of_array(&self.y_matrix, &mut self.sign_matrix);
+
+            // TODO: Combine the test checking for parallelity between _all_ columns between S
+            // and S_old with the “if t > 1” test below.
+            //
+            // > If every column of S is parallel to a column of Sold, goto (6), end
+            //
+            // NOTE: We are reusing `y_matrix` here as a temporary value.
+            if are_all_columns_parallel_between(
+                &self.sign_matrix_old,
+                &self.sign_matrix,
+                &mut self.y_matrix,
+            ) {
+                break 'optimization_loop;
+            }
+
+            // FIXME: Is an explicit if condition here necessary?
+            if t > 1 {
+                // > Ensure that no column of S is parallel to another column of S
+                // > or to a column of Sold by replacing columns of S by rand{-1,+1}
+                //
+                // NOTE: We are reusing `y_matrix` here as a temporary value.
+                resample_parallel_columns(
+                    &mut self.sign_matrix,
+                    &self.sign_matrix_old,
+                    &mut self.y_matrix,
+                    &mut self.column_is_parallel,
+                    &mut self.rng,
+                    &sample,
+                );
+            }
+
+            // > est_old = est, Sold = S
+            // NOTE: Other than in the original algorithm, we store the sign matrix at this point
+            // already. This way, we can reuse the sign matrix as additional workspace which is
+            // useful when performing matrix multiplication with A^m or A1 A2 ... An (see the
+            // description of the LinearOperator trait for explanation).
+            //
+            // NOTE: We don't “save” the old estimate, because we are using max_norm as another name
+            // for the new estimate instead of overwriting/reusing est.
+            self.sign_matrix_old.assign(&self.sign_matrix);
+
+            // Z = A^T S
+            a_linear_operator.multiply_matrix(&mut self.sign_matrix, &mut self.z_matrix, true);
+
+            // hᵢ= ‖Z(i,:)‖_∞
+            let mut max_h = 0.0;
+            for (row, h_element) in self.z_matrix.genrows().into_iter().zip(self.h.iter_mut()) {
+                let h = vector_maxnorm(&row);
+                max_h = if h > max_h { h } else { max_h };
+                // Convert f64 to NotNan for using sort_unstable_by below
+                *h_element = h.into();
+            }
+
+            // TODO: This test for equality needs an approximate equality test instead.
+            if k > 0 && max_h == self.h[best_index].into() {
+                break 'optimization_loop;
+            }
+
+            // > Sort h so that h_1 >= ... >= h_n and re-order correspondingly.
+            // NOTE: h itself doesn't need to be reordered. Only the order of
+            // the indices is relevant.
+            {
+                let h_ref = &self.h;
+                self.indices
+                    .sort_unstable_by(|i, j| h_ref[*j].cmp(&h_ref[*i]));
+            }
+
+            self.x_matrix.fill(0.0);
+            if t > 1 {
+                // > Replace ind(1:t) by the first t indices in ind(1:n) that are not in ind_hist.
+                //
+                // > X(:, j) = e_ind_j, j = 1:t
+                //
+                // > ind_hist = [ind_hist ind(1:t)]
+                //
+                // NOTE: It's not actually needed to operate on the `indices` vector. What's important
+                // is that the history of indices, `indices_history`, gets updated with visited indices,
+                // and that each column of `x_matrix` is assigned that unit vector that is defined by the
+                // respective index.
+                //
+                // If so many indices have already been used that `n_cols - indices_history.len() < t`
+                // (which means that we have less than `t` unused indices remaining), we have to use a few
+                // historical indices when filling up the columns in `x_matrix`. For that, we put the
+                // historical indices after the fresh indices, but otherwise keep the order induced by `h`
+                // above.
+                let fresh_indices = cmp::min(t, n - self.indices_history.len());
+                if fresh_indices == 0 {
+                    break 'optimization_loop;
+                }
+                let mut current_column_fresh = 0;
+                let mut current_column_historical = fresh_indices;
+                let mut index_iterator = self.indices.iter();
+
+                let mut all_first_t_in_history = true;
+                // First, iterate over the first t sorted indices.
+                for i in (&mut index_iterator).take(t) {
+                    if !self.indices_history.contains(i) {
+                        all_first_t_in_history = false;
+                        self.x_matrix[(*i, current_column_fresh)] = 1.0;
+                        current_column_fresh += 1;
+                        self.indices_history.insert(*i);
+                    } else if current_column_historical < t {
+                        self.x_matrix[(*i, current_column_historical)] = 1.0;
+                        current_column_historical += 1;
+                    }
+                }
+
+                // > if ind(1:t) is contained in ind_hist, goto (6), end
+                if all_first_t_in_history {
+                    break 'optimization_loop;
+                }
+
+                // Iterate over the remaining indices
+                'fill_x: for i in index_iterator {
+                    if current_column_fresh >= t {
+                        break 'fill_x;
+                    }
+                    if !self.indices_history.contains(i) {
+                        self.x_matrix[(*i, current_column_fresh)] = 1.0;
+                        current_column_fresh += 1;
+                        self.indices_history.insert(*i);
+                    } else if current_column_historical < t {
+                        self.x_matrix[(*i, current_column_historical)] = 1.0;
+                        current_column_historical += 1;
+                    }
+                }
+            }
+        }
+
+        estimate
+    }
+
+    /// Estimate the 1-norm of matrix `a` using up to `itmax` iterations.
+    pub fn normest1<S>(&mut self, a: &ArrayBase<S, Ix2>, itmax: usize) -> f64
+    where
+        S: Data<Elem = f64>,
+    {
+        self.calculate(a, itmax)
+    }
+
+    /// Estimate the 1-norm of a marix `a` to the power `m` up to `itmax` iterations.
+    pub fn normest1_pow<S>(&mut self, a: &ArrayBase<S, Ix2>, m: usize, itmax: usize) -> f64
+    where
+        S: Data<Elem = f64>,
+    {
+        self.calculate(&(a, m), itmax)
+    }
+
+    /// Estimate the 1-norm of a product of matrices `a1 a2 ... an` up to `itmax` iterations.
+    pub fn normest1_prod<S>(&mut self, aprod: &[&ArrayBase<S, Ix2>], itmax: usize) -> f64
+    where
+        S: Data<Elem = f64>,
+    {
+        self.calculate(aprod, itmax)
+    }
+}
+
+/// Estimates the 1-norm of matrix `a`.
+///
+/// The parameter `t` is the number of vectors that have to fulfill some bound. See [Higham,
+/// Tisseur] for more information. `itmax` is the maximum number of sweeps permitted.
+///
+/// **NOTE:** This function allocates on every call. If you want to repeatedly estimate the
+/// 1-norm on matrices of the same size, construct a [`Normest1`] first, and call its methods.
+///
+/// [Higham, Tisseur]: http://eprints.ma.man.ac.uk/321/1/covered/MIMS_ep2006_145.pdf
+/// [`Normest1`]: struct.Normest1.html
+pub fn normest1(a_matrix: &Array2<f64>, t: usize, itmax: usize) -> f64 {
+    // Assume the matrix is square and take the columns as n. If it's not square, the assertion in
+    // normest.calculate will fail.
+    let n = a_matrix.dim().1;
+    let mut normest1 = Normest1::new(n, t);
+    normest1.normest1(a_matrix, itmax)
+}
+
+/// Estimates the 1-norm of a matrix `a` to the power `m`, `a^m`.
+///
+/// The parameter `t` is the number of vectors that have to fulfill some bound. See [Higham,
+/// Tisseur] for more information. `itmax` is the maximum number of sweeps permitted.
+///
+/// **NOTE:** This function allocates on every call. If you want to repeatedly estimate the
+/// 1-norm on matrices of the same size, construct a [`Normest1`] first, and call its methods.
+///
+/// [Higham, Tisseur]: http://eprints.ma.man.ac.uk/321/1/covered/MIMS_ep2006_145.pdf
+pub fn normest1_pow(a_matrix: &Array2<f64>, m: usize, t: usize, itmax: usize) -> f64 {
+    // Assume the matrix is square and take the columns as n. If it's not square, the assertion in
+    // normest.calculate will fail.
+    let n = a_matrix.dim().1;
+    let mut normest1 = Normest1::new(n, t);
+    normest1.normest1_pow(a_matrix, m, itmax)
+}
+
+/// Estimates the 1-norm of a product of matrices `a1`, `a2`, ..., `an` passed in as a slice of
+/// references.
+///
+/// The parameter `t` is the number of vectors that have to fulfill some bound. See [Higham,
+/// Tisseur] for more information. `itmax` is the maximum number of sweeps permitted.
+///
+/// **NOTE:** This function allocates on every call. If you want to repeatedly estimate the
+/// 1-norm on matrices of the same size, construct a [`Normest1`] first, and call its methods.
+///
+/// [Higham, Tisseur]: http://eprints.ma.man.ac.uk/321/1/covered/MIMS_ep2006_145.pdf
+pub fn normest1_prod(a_matrices: &[&Array2<f64>], t: usize, itmax: usize) -> f64 {
+    assert!(a_matrices.len() > 0);
+    let n = a_matrices[0].dim().1;
+    let mut normest1 = Normest1::new(n, t);
+    normest1.normest1_prod(a_matrices, itmax)
+}
+
+/// Assigns the sign of matrix `a` to matrix `b`.
+///
+/// Panics if matrices `a` and `b` have different shape and strides, or if either underlying array is
+/// non-contiguous. This is to make sure that the iteration order over the matrices is the same.
+fn assign_signum_of_array<S1, S2, D>(a: &ArrayBase<S1, D>, b: &mut ArrayBase<S2, D>)
+where
+    S1: Data<Elem = f64>,
+    S2: DataMut<Elem = f64>,
+    D: Dimension,
+{
+    assert_eq!(a.strides(), b.strides());
+    let (a_slice, a_layout) = as_slice_with_layout(a).expect("Matrix `a` is not contiguous.");
+    let (b_slice, b_layout) = as_slice_with_layout_mut(b).expect("Matrix `b` is not contiguous.");
+    assert_eq!(a_layout, b_layout);
+
+    signum_of_slice(a_slice, b_slice);
+}
+
+fn signum_of_slice(source: &[f64], destination: &mut [f64]) {
+    for (s, d) in source.iter().zip(destination) {
+        *d = s.signum();
+    }
+}
+
+/// Calculate the onenorm of a vector (an `ArrayBase` with dimension `Ix1`).
+fn vector_onenorm<S>(a: &ArrayBase<S, Ix1>) -> f64
+where
+    S: Data<Elem = f64>,
+{
+    let stride = a.strides()[0];
+    assert!(stride >= 0);
+    let stride = stride as usize;
+    let n_elements = a.len();
+    let a_slice = {
+        let a = a.as_ptr();
+        let total_len = n_elements * stride;
+        unsafe { slice::from_raw_parts(a, total_len) }
+    };
+
+    unsafe { cblas::dasum(n_elements as i32, a_slice, stride as i32) }
+}
+
+/// Calculate the maximum norm of a vector (an `ArrayBase` with dimension `Ix1`).
+fn vector_maxnorm<S>(a: &ArrayBase<S, Ix1>) -> f64
+where
+    S: Data<Elem = f64>,
+{
+    let stride = a.strides()[0];
+    assert!(stride >= 0);
+    let stride = stride as usize;
+    let n_elements = a.len();
+    let a_slice = {
+        let a = a.as_ptr();
+        let total_len = n_elements * stride;
+        unsafe { slice::from_raw_parts(a, total_len) }
+    };
+
+    let idx = unsafe { cblas::idamax(n_elements as i32, a_slice, stride as i32) as usize };
+    f64::abs(a[idx])
+}
+
+// /// Calculate the onenorm of a matrix (an `ArrayBase` with dimension `Ix2`).
+// fn matrix_onenorm<S>(a: &ArrayBase<S, Ix2>) -> f64
+//     where S: Data<Elem=f64>,
+// {
+//     let (n_rows, n_cols) = a.dim();
+//     if let Some((a_slice, layout)) = as_slice_with_layout(a) {
+//         let layout = match layout {
+//             cblas::Layout::RowMajor => lapacke::Layout::RowMajor,
+//             cblas::Layout::ColumnMajor => lapacke::Layout::ColumnMajor,
+//         };
+//         unsafe {
+//             lapacke::dlange(
+//                 layout,
+//                 b'1',
+//                 n_rows as i32,
+//                 n_cols as i32,
+//                 a_slice,
+//                 n_rows as i32,
+//             )
+//         }
+//     // Fall through case for non-contiguous arrays.
+//     } else {
+//         a.gencolumns().into_iter()
+//             .fold(0.0, |max, column| {
+//                 let onenorm = column.fold(0.0, |acc, element| { acc + f64::abs(*element) });
+//                 if onenorm > max { onenorm } else { max }
+//         })
+//     }
+// }
+
+/// Returns the one-norm of a matrix `a` together with the index of that column for
+/// which the norm is maximal.
+fn matrix_onenorm_with_index<S>(a: &ArrayBase<S, Ix2>) -> (usize, f64)
+where
+    S: Data<Elem = f64>,
+{
+    let mut max_norm = 0.0;
+    let mut max_norm_index = 0;
+    for (i, column) in a.gencolumns().into_iter().enumerate() {
+        let norm = vector_onenorm(&column);
+        if norm > max_norm {
+            max_norm = norm;
+            max_norm_index = i;
+        }
+    }
+    (max_norm_index, max_norm)
+}
+
+/// Finds columns in the matrix `a` that are parallel to to some other column in `a`.
+///
+/// Assumes that all entries of `a` are either +1 or -1.
+///
+/// If column `j` of matrix `a` is parallel to some column `i`, `column_is_parallel[i]` is set to
+/// `true`. The matrix `c` is used as an intermediate value for the matrix product `a^t * a`.
+///
+/// This function does not reset `column_is_parallel` to `false`. Entries that are `true` will be
+/// assumed to be parallel and not checked.
+///
+/// Panics if arrays `a` and `c` don't have the same dimensions, or if the length of the slice
+/// `column_is_parallel` is not equal to the number of columns in `a`.
+fn find_parallel_columns_in<S1, S2>(
+    a: &ArrayBase<S1, Ix2>,
+    c: &mut ArrayBase<S2, Ix2>,
+    column_is_parallel: &mut [bool],
+) where
+    S1: Data<Elem = f64>,
+    S2: DataMut<Elem = f64>,
+{
+    let a_dim = a.dim();
+    let c_dim = c.dim();
+    assert_eq!(a_dim, c_dim);
+
+    let (n_rows, n_cols) = a_dim;
+
+    assert_eq!(column_is_parallel.len(), n_cols);
+    {
+        let (a_slice, a_layout) = as_slice_with_layout(a).expect("Matrix `a` is not contiguous.");
+        let (c_slice, c_layout) =
+            as_slice_with_layout_mut(c).expect("Matrix `c` is not contiguous.");
+        assert_eq!(a_layout, c_layout);
+        let layout = a_layout;
+
+        // NOTE: When calling the wrapped Fortran dsyrk subroutine with row major layout,
+        // cblas::*syrk changes `'U'` to `'L'` (`Upper` to `Lower`), and `'O'` to `'N'` (`Ordinary`
+        // to `None`). Different from `cblas::*gemm`, however, it does not automatically make sure
+        // that the other arguments are changed to make sense in a routine expecting column major
+        // order (in `cblas::*gemm`, this happens by flipping the matrices `a` and `b` as
+        // arguments).
+        //
+        // So while `cblas::dsyrk` changes transposition and the position of where the results are
+        // written to, it passes the other arguments on to the Fortran routine as is.
+        //
+        // For example, in case matrix `a` is a 4x2 matrix in column major order, and we want to
+        // perform the operation `a^T a` on it (resulting in a symmetric 2x2 matrix), we would pass
+        // TRANS='T', N=2 (order of c), K=4 (number of rows because of 'T'), LDA=4 (max(1,k)
+        // because of 'T'), LDC=2.
+        //
+        // But if `a` is in row major order and we want to perform the same operation, we pass
+        // TRANS='T' (gets translated to 'N'), N=2, K=2 (number of columns, because we 'T' -> 'N'),
+        // LDA=2 (max(1,n) because of 'N'), LDC=2.
+        //
+        // In other words, because of row major order, the Fortran routine actually sees our 4x2
+        // matrix as a 2x4 matrix, and if we want to calculate `a^T a`, `cblas::dsyrk` makes sure
+        // `'N'` is passed.
+        let (k, lda) = match layout {
+            cblas::Layout::ColumnMajor => (n_cols, n_rows),
+            cblas::Layout::RowMajor => (n_rows, n_cols),
+        };
+        unsafe {
+            cblas::dsyrk(
+                layout,
+                cblas::Part::Upper,
+                cblas::Transpose::Ordinary,
+                n_cols as i32,
+                k as i32,
+                1.0,
+                a_slice,
+                lda as i32,
+                0.0,
+                c_slice,
+                n_cols as i32,
+            );
+        }
+    }
+
+    // c is upper triangular and contains all pair-wise vector products:
+    //
+    // x x x x x
+    // . x x x x
+    // . . x x x
+    // . . . x x
+    // . . . . x
+
+    // Don't check more rows than we have columns
+    'rows: for (i, row) in c.genrows().into_iter().enumerate().take(n_cols) {
+        // Skip if the column is already found to be parallel or if we are checking
+        // the last column
+        if column_is_parallel[i] || i >= n_cols - 1 {
+            continue 'rows;
+        }
+        for (j, element) in row.slice(s![i + 1..]).iter().enumerate() {
+            // Check if the vectors are parallel or anti-parallel
+            if f64::abs(*element) == n_rows as f64 {
+                column_is_parallel[i + j + 1] = true;
+            }
+        }
+    }
+}
+
+/// Checks whether any columns of the matrix `a` are parallel to any columns of `b`.
+///
+/// Assumes that we have parallelity only if all entries of two columns `a` and `b` are either +1
+/// or -1.
+///
+/// `The matrix `c` is used as an intermediate value for the matrix product `a^t * b`.
+///
+/// `column_is_parallel[j]` is set to `true` if column `j` of matrix `a` is parallel to some column
+/// `i` of the matrix `b`,
+///
+/// This function does not reset `column_is_parallel` to `false`. Entries that are `true` will be
+/// assumed to be parallel and not checked.
+///
+/// Panics if arrays `a`, `b`, and `c` don't have the same dimensions, or if the length of the slice
+/// `column_is_parallel` is not equal to the number of columns in `a`.
+fn find_parallel_columns_between<S1, S2, S3>(
+    a: &ArrayBase<S1, Ix2>,
+    b: &ArrayBase<S2, Ix2>,
+    c: &mut ArrayBase<S3, Ix2>,
+    column_is_parallel: &mut [bool],
+) where
+    S1: Data<Elem = f64>,
+    S2: Data<Elem = f64>,
+    S3: DataMut<Elem = f64>,
+{
+    let a_dim = a.dim();
+    let b_dim = b.dim();
+    let c_dim = c.dim();
+    assert_eq!(a_dim, b_dim);
+    assert_eq!(a_dim, c_dim);
+
+    let (n_rows, n_cols) = a_dim;
+
+    assert_eq!(column_is_parallel.len(), n_cols);
+
+    // Extra scope, because c_slice needs to be dropped after the dgemm
+    {
+        let (a_slice, a_layout) = as_slice_with_layout(&a).expect("Matrix `a` not contiguous.");
+        let (b_slice, b_layout) = as_slice_with_layout(&b).expect("Matrix `b` not contiguous.");
+        let (c_slice, c_layout) = as_slice_with_layout_mut(c).expect("Matrix `c` not contiguous.");
+
+        assert_eq!(a_layout, b_layout);
+        assert_eq!(a_layout, c_layout);
+
+        let layout = a_layout;
+
+        unsafe {
+            cblas::dgemm(
+                layout,
+                cblas::Transpose::Ordinary,
+                cblas::Transpose::None,
+                n_cols as i32,
+                n_cols as i32,
+                n_rows as i32,
+                1.0,
+                a_slice,
+                n_cols as i32,
+                b_slice,
+                n_cols as i32,
+                0.0,
+                c_slice,
+                n_cols as i32,
+            );
+        }
+    }
+
+    // We are iterating over the rows because it's more memory efficient (for row-major array).  In
+    // terms of logic there is no difference: we simply check if the current column of a (that's
+    // the outer loop) is parallel to any column of b (inner loop). By iterating via columns we would check if
+    // any column of a is parallel to the, in that case, current column of b.
+    // TODO:  Implement for column major arrays.
+    'rows: for (i, row) in c.genrows().into_iter().enumerate().take(n_cols) {
+        // Skip if the column is already found to be parallel the last column.
+        if column_is_parallel[i] {
+            continue 'rows;
+        }
+        for element in row {
+            if f64::abs(*element) == n_rows as f64 {
+                column_is_parallel[i] = true;
+                continue 'rows;
+            }
+        }
+    }
+}
+
+/// Check if every column in `a` is parallel to some column in `b`.
+///
+/// Assumes that we have parallelity only if all entries of two columns `a` and `b` are either +1
+/// or -1.
+fn are_all_columns_parallel_between<S1, S2>(
+    a: &ArrayBase<S1, Ix2>,
+    b: &ArrayBase<S1, Ix2>,
+    c: &mut ArrayBase<S2, Ix2>,
+) -> bool
+where
+    S1: Data<Elem = f64>,
+    S2: DataMut<Elem = f64>,
+{
+    let a_dim = a.dim();
+    let b_dim = b.dim();
+    let c_dim = c.dim();
+    assert_eq!(a_dim, b_dim);
+    assert_eq!(a_dim, c_dim);
+
+    let (n_rows, n_cols) = a_dim;
+
+    // Extra scope, because c_slice needs to be dropped after the dgemm
+    {
+        let (a_slice, a_layout) = as_slice_with_layout(&a).expect("Matrix `a` not contiguous.");
+        let (b_slice, b_layout) = as_slice_with_layout(&b).expect("Matrix `b` not contiguous.");
+        let (c_slice, c_layout) = as_slice_with_layout_mut(c).expect("Matrix `c` not contiguous.");
+
+        assert_eq!(a_layout, b_layout);
+        assert_eq!(a_layout, c_layout);
+
+        let layout = a_layout;
+
+        unsafe {
+            cblas::dgemm(
+                layout,
+                cblas::Transpose::Ordinary,
+                cblas::Transpose::None,
+                n_cols as i32,
+                n_cols as i32,
+                n_rows as i32,
+                1.0,
+                a_slice,
+                n_cols as i32,
+                b_slice,
+                n_cols as i32,
+                0.0,
+                c_slice,
+                n_rows as i32,
+            );
+        }
+    }
+
+    // We are iterating over the rows because it's more memory efficient (for row-major array).  In
+    // terms of logic there is no difference: we simply check if a specific column of a is parallel
+    // to any column of b. By iterating via columns we would check if any column of a is parallel
+    // to a specific column of b.
+    'rows: for row in c.genrows() {
+        for element in row {
+            // If a parallel column was found, cut to the next one.
+            if f64::abs(*element) == n_rows as f64 {
+                continue 'rows;
+            }
+        }
+        // This return statement should only be reached if not a single column parallel to the
+        // current one was found.
+        return false;
+    }
+    true
+}
+
+/// Find parallel columns in matrix `a` and columns in `a` that are parallel to any columns in
+/// matrix `b`, and replace those with random vectors. Returns `true` if resampling has taken place.
+fn resample_parallel_columns<S1, S2, S3, R>(
+    a: &mut ArrayBase<S1, Ix2>,
+    b: &ArrayBase<S2, Ix2>,
+    c: &mut ArrayBase<S3, Ix2>,
+    column_is_parallel: &mut [bool],
+    rng: &mut R,
+    sample: &[f64],
+) -> bool
+where
+    S1: DataMut<Elem = f64>,
+    S2: Data<Elem = f64>,
+    S3: DataMut<Elem = f64>,
+    R: Rng,
+{
+    column_is_parallel.iter_mut().for_each(|x| {
+        *x = false;
+    });
+    find_parallel_columns_in(a, c, column_is_parallel);
+    find_parallel_columns_between(a, b, c, column_is_parallel);
+    let mut has_resampled = false;
+    for (i, is_parallel) in column_is_parallel.into_iter().enumerate() {
+        if *is_parallel {
+            resample_column(a, i, rng, sample);
+            has_resampled = true;
+        }
+    }
+    has_resampled
+}
+
+/// Resamples column `i` of matrix `a` with elements drawn from `sample` using `rng`.
+///
+/// Panics if `i` exceeds the number of columns in `a`.
+fn resample_column<R, S>(a: &mut ArrayBase<S, Ix2>, i: usize, rng: &mut R, sample: &[f64])
+where
+    S: DataMut<Elem = f64>,
+    R: Rng,
+{
+    assert!(
+        i < a.dim().1,
+        "Trying to resample column with index exceeding matrix dimensions"
+    );
+    assert!(sample.len() > 0);
+    a.column_mut(i)
+        .mapv_inplace(|_| sample[rng.gen_range(0, sample.len())]);
+}
+
+/// Returns slice and layout underlying an array `a`.
+fn as_slice_with_layout<S, T, D>(a: &ArrayBase<S, D>) -> Option<(&[T], cblas::Layout)>
+where
+    S: Data<Elem = T>,
+    D: Dimension,
+{
+    if let Some(a_slice) = a.as_slice() {
+        Some((a_slice, cblas::Layout::RowMajor))
+    } else if let Some(a_slice) = a.as_slice_memory_order() {
+        Some((a_slice, cblas::Layout::ColumnMajor))
+    } else {
+        None
+    }
+}
+
+/// Returns mutable slice and layout underlying an array `a`.
+fn as_slice_with_layout_mut<S, T, D>(a: &mut ArrayBase<S, D>) -> Option<(&mut [T], cblas::Layout)>
+where
+    S: DataMut<Elem = T>,
+    D: Dimension,
+{
+    if a.as_slice_mut().is_some() {
+        Some((a.as_slice_mut().unwrap(), cblas::Layout::RowMajor))
+    } else if a.as_slice_memory_order_mut().is_some() {
+        Some((
+            a.as_slice_memory_order_mut().unwrap(),
+            cblas::Layout::ColumnMajor,
+        ))
+    } else {
+        None
+    }
+    // XXX: The above is a workaround for Rust not having non-lexical lifetimes yet.
+    // More information here:
+    // http://smallcultfollowing.com/babysteps/blog/2016/04/27/non-lexical-lifetimes-introduction/#problem-case-3-conditional-control-flow-across-functions
+    //
+    // if let Some(slice) = a.as_slice_mut() {
+    //     Some((slice, cblas::Layout::RowMajor))
+    // } else if let Some(slice) = a.as_slice_memory_order_mut() {
+    //     Some((slice, cblas::Layout::ColumnMajor))
+    // } else {
+    //     None
+    // }
+}
--- a/src/expm.rs
+++ b/src/expm.rs
@@ -0,0 +1,781 @@
+/// This crate contains `expm`, an implementation of Algorithm 6.1 by [Al-Mohy, Higham] in the Rust
+/// programming language. It calculates the exponential of a matrix. See the linked paper for more
+/// information.
+///
+/// An important ingredient is `normest1`, Algorithm 2.4 in [Higham, Tisseur], which estimates
+/// the 1-norm of a matrix.
+///
+/// Furthermore, to fully understand the algorithm as described in the original paper, one has to
+/// understand that the factor $\lvert C_{2m+1} \rvert$ arises during the Padé approximation of the
+/// exponential function. The derivation is described in [Gautschi 2012], pp. 363--365, and the
+/// factor reads:
+///
+/// \begin{equation}
+/// C_{n,m} = (-1)^n \frac{n!m!}{(n+m)!(n+m+1)!},
+/// \end{equation}
+///
+/// or using only the diagonal elements, $m=n$:
+///
+/// \begin{equation}
+/// C_m = (-1)^m \frac{m!m!}{(2m)!(2m+1)!}
+/// \end{equation}
+///
+///
+/// [Al-Mohy, Higham]: http://eprints.ma.man.ac.uk/1300/1/covered/MIMS_ep2009_9.pdf
+/// [Higham, Tisseur]: http://eprints.ma.man.ac.uk/321/1/covered/MIMS_ep2006_145.pdf
+/// [Gautschi 2012]: https://doi.org/10.1007/978-0-8176-8259-0
+use ndarray::{self, prelude::*, Data, DataMut, Dimension, Zip};
+
+use crate::condest::Normest1;
+
+// Can we calculate these at compile time?
+const THETA_3: f64 = 1.495585217958292e-2;
+const THETA_5: f64 = 2.539398330063230e-1;
+const THETA_7: f64 = 9.504178996162932e-1;
+const THETA_9: f64 = 2.097847961257068e0;
+// const THETA_13: f64 = 5.371920351148152e0 // Alg 3.1
+const THETA_13: f64 = 4.25; // Alg 5.1
+
+const PADE_COEFF_3: [f64; 4] = [120., 60., 12., 1.];
+const PADE_COEFF_5: [f64; 6] = [30240., 15120., 3360., 420., 30., 1.];
+const PADE_COEFF_7: [f64; 8] = [
+    17297280., 8648640., 1995840., 277200., 25200., 1512., 56., 1.,
+];
+const PADE_COEFF_9: [f64; 10] = [
+    17643225600.,
+    8821612800.,
+    2075673600.,
+    302702400.,
+    30270240.,
+    2162160.,
+    110880.,
+    3960.,
+    90.,
+    1.,
+];
+const PADE_COEFF_13: [f64; 14] = [
+    64764752532480000.,
+    32382376266240000.,
+    7771770303897600.,
+    1187353796428800.,
+    129060195264000.,
+    10559470521600.,
+    670442572800.,
+    33522128640.,
+    1323241920.,
+    40840800.,
+    960960.,
+    16380.,
+    182.,
+    1.,
+];
+
+/// Calculates the of leading terms in the backward error function for the [m/m] Padé approximant
+/// to the exponential function, i.e. it calculates:
+///
+/// \begin{align}
+///     C_{2m+1} &= \frac{(m!)^2}{(2m)!(2m+1)!} \\
+///              &= \frac{1}{\binom{2m}{m} (2m+1)!}
+/// \end{align}
+///
+/// NOTE: Depending on the notation used in the scientific papers, the coefficient `C` is,
+/// confusingly, sometimes indexed `C_i` and sometimes `C_{2m+1}`. These essentially mean the same
+/// thing and is due to the power series expansion of the backward error function:
+///
+/// \begin{equation}
+///     h(x) = \sum^\infty_{i=2m+1} C_i x^i
+/// \end{equation}
+fn pade_error_coefficient(m: u64) -> f64 {
+    use statrs::function::factorial::{binomial, factorial};
+
+    return 1.0 / (binomial(2 * m, m) * factorial(2 * m + 1));
+}
+
+#[allow(non_camel_case_types)]
+struct PadeOrder_3;
+#[allow(non_camel_case_types)]
+struct PadeOrder_5;
+#[allow(non_camel_case_types)]
+struct PadeOrder_7;
+#[allow(non_camel_case_types)]
+struct PadeOrder_9;
+#[allow(non_camel_case_types)]
+struct PadeOrder_13;
+
+enum PadeOrders {
+    _3,
+    _5,
+    _7,
+    _9,
+    _13,
+}
+
+trait PadeOrder {
+    const ORDER: u64;
+
+    /// Return the coefficients arising in both the numerator as well as in the denominator of the
+    /// Padé approximant (they are the same, due to $p(x) = q(-x)$.
+    ///
+    /// TODO: This is a great usecase for const generics, returning &[u64; Self::ORDER] and
+    /// possibly calculating the values at compile time instead of hardcoding them.
+    /// Maybe possible once RFC 2000 lands? See the PR https://github.com/rust-lang/rust/pull/53645
+    fn coefficients() -> &'static [f64];
+
+    fn calculate_pade_sums<S1, S2, S3>(
+        a: &ArrayBase<S1, Ix2>,
+        a_powers: &[&ArrayBase<S1, Ix2>],
+        u: &mut ArrayBase<S2, Ix2>,
+        v: &mut ArrayBase<S3, Ix2>,
+        work: &mut ArrayBase<S2, Ix2>,
+    ) where
+        S1: Data<Elem = f64>,
+        S2: DataMut<Elem = f64>,
+        S3: DataMut<Elem = f64>;
+}
+
+macro_rules! impl_padeorder {
+    ($($ty:ty, $m:literal, $const_coeff:ident),+) => {
+
+$(
+
+impl PadeOrder for $ty {
+    const ORDER: u64 = $m;
+
+    fn coefficients() -> &'static [f64] {
+        &$const_coeff
+    }
+
+    fn calculate_pade_sums<S1, S2, S3>(
+        a: &ArrayBase<S1, Ix2>,
+        a_powers: &[&ArrayBase<S1, Ix2>],
+        u: &mut ArrayBase<S2, Ix2>,
+        v: &mut ArrayBase<S3, Ix2>,
+        work: &mut ArrayBase<S2, Ix2>,
+    )
+        where S1: Data<Elem=f64>,
+              S2: DataMut<Elem=f64>,
+              S3: DataMut<Elem=f64>,
+    {
+        assert_eq!(a_powers.len(), ($m - 1)/2 + 1);
+
+        let (n_rows, n_cols) = a.dim();
+        assert_eq!(n_rows, n_cols, "Pade sum only defined for square matrices.");
+        let n = n_rows as i32;
+
+        // Iterator to get 2 coefficients, c_{2i} and c_{2i+1}, and 1 matrix power at a time.
+        let mut iterator = Self::coefficients().chunks_exact(2).zip(a_powers.iter());
+
+        // First element from the iterator.
+        //
+        // NOTE: The unwrap() and unreachable!() are permissable because the assertion above
+        // ensures the validity.
+        //
+        // TODO: An optimization is probably to just set u and v to zero and only assign the
+        // coefficients to its diagonal, given that A_0 = A^0 = 1.
+        let (c_0, c_1, a_pow) = match iterator.next().unwrap() {
+            (&[c_0, c_1], a_pow) => (c_0, c_1, a_pow),
+            _ => unreachable!()
+        };
+
+        work.zip_mut_with(a_pow, |x, &y| *x = c_1 * y);
+        v.zip_mut_with(a_pow, |x, &y| *x = c_0 * y);
+
+        // Rest of the iterator
+        while let Some(item) = iterator.next() {
+            let (c_2k, c_2k1, a_pow) = match item {
+                (&[c_2k, c_2k1], a_pow) => (c_2k, c_2k1, a_pow),
+                _ => unreachable!()
+            };
+
+            work.zip_mut_with(a_pow, |x, &y| *x = *x + c_2k1 * y);
+            v.zip_mut_with(a_pow, |x, &y| *x = *x + c_2k * y);
+        }
+
+        let (a_slice, a_layout) = as_slice_with_layout(a).expect("Matrix `a` not contiguous.");
+        let (work_slice, _) = as_slice_with_layout(work).expect("Matrix `work` not contiguous.");
+        let (u_slice, u_layout) = as_slice_with_layout_mut(u).expect("Matrix `u` not contiguous.");
+        assert_eq!(a_layout, u_layout, "Memory layout mismatch between matrices; currently only row major matrices are supported.");
+        let layout = a_layout;
+        unsafe {
+            cblas::dgemm(
+                layout,
+                cblas::Transpose::None,
+                cblas::Transpose::None,
+                n,
+                n,
+                n,
+                1.0,
+                a_slice,
+                n,
+                work_slice,
+                n,
+                0.0,
+                u_slice,
+                n,
+            )
+        }
+    }
+}
+
+)+
+}
+}
+
+impl_padeorder!(
+    PadeOrder_3,
+    3,
+    PADE_COEFF_3,
+    PadeOrder_5,
+    5,
+    PADE_COEFF_5,
+    PadeOrder_7,
+    7,
+    PADE_COEFF_7,
+    PadeOrder_9,
+    9,
+    PADE_COEFF_9
+);
+
+impl PadeOrder for PadeOrder_13 {
+    const ORDER: u64 = 13;
+
+    fn coefficients() -> &'static [f64] {
+        &PADE_COEFF_13
+    }
+
+    fn calculate_pade_sums<S1, S2, S3>(
+        a: &ArrayBase<S1, Ix2>,
+        a_powers: &[&ArrayBase<S1, Ix2>],
+        u: &mut ArrayBase<S2, Ix2>,
+        v: &mut ArrayBase<S3, Ix2>,
+        work: &mut ArrayBase<S2, Ix2>,
+    ) where
+        S1: Data<Elem = f64>,
+        S2: DataMut<Elem = f64>,
+        S3: DataMut<Elem = f64>,
+    {
+        assert_eq!(a_powers.len(), (13 - 1) / 2 + 1);
+
+        let (n_rows, n_cols) = a.dim();
+        assert_eq!(n_rows, n_cols, "Pade sum only defined for square matrices.");
+        let n = n_rows;
+
+        let coefficients = Self::coefficients();
+
+        Zip::from(&mut *work)
+            .and(a_powers[0])
+            .and(a_powers[1])
+            .and(a_powers[2])
+            .and(a_powers[3])
+            .apply(|x, &a0, &a2, &a4, &a6| {
+                *x = *x
+                    + coefficients[1] * a0
+                    + coefficients[3] * a2
+                    + coefficients[5] * a4
+                    + coefficients[7] * a6;
+            });
+
+        {
+            let (a_slice, a_layout) = as_slice_with_layout(a).expect("Matrix `a` not contiguous.");
+            let (work_slice, _) =
+                as_slice_with_layout(work).expect("Matrix `work` not contiguous.");
+            let (u_slice, u_layout) =
+                as_slice_with_layout_mut(u).expect("Matrix `u` not contiguous.");
+            assert_eq!(a_layout, u_layout, "Memory layout mismatch between matrices; currently only row major matrices are supported.");
+            let layout = a_layout;
+            unsafe {
+                cblas::dgemm(
+                    layout,
+                    cblas::Transpose::None,
+                    cblas::Transpose::None,
+                    n as i32,
+                    n as i32,
+                    n as i32,
+                    1.0,
+                    a_slice,
+                    n as i32,
+                    work_slice,
+                    n as i32,
+                    0.0,
+                    u_slice,
+                    n as i32,
+                )
+            }
+        }
+
+        Zip::from(&mut *work)
+            .and(a_powers[1])
+            .and(a_powers[2])
+            .and(a_powers[3])
+            .apply(|x, &a2, &a4, &a6| {
+                *x = coefficients[8] * a2 + coefficients[10] * a4 + coefficients[12] * a6;
+            });
+
+        {
+            let (a6_slice, a6_layout) =
+                as_slice_with_layout(a_powers[3]).expect("Matrix `a6` not contiguous.");
+            let (work_slice, _) =
+                as_slice_with_layout(work).expect("Matrix `work` not contiguous.");
+            let (v_slice, v_layout) =
+                as_slice_with_layout_mut(v).expect("Matrix `v` not contiguous.");
+            assert_eq!(a6_layout, v_layout, "Memory layout mismatch between matrices; currently only row major matrices are supported.");
+            let layout = a6_layout;
+            unsafe {
+                cblas::dgemm(
+                    layout,
+                    cblas::Transpose::None,
+                    cblas::Transpose::None,
+                    n as i32,
+                    n as i32,
+                    n as i32,
+                    1.0,
+                    a6_slice,
+                    n as i32,
+                    work_slice,
+                    n as i32,
+                    0.0,
+                    v_slice,
+                    n as i32,
+                )
+            }
+        }
+
+        Zip::from(v)
+            .and(a_powers[0])
+            .and(a_powers[1])
+            .and(a_powers[2])
+            .and(a_powers[3])
+            .apply(|x, &a0, &a2, &a4, &a6| {
+                *x = *x
+                    + coefficients[0] * a0
+                    + coefficients[2] * a2
+                    + coefficients[4] * a4
+                    + coefficients[6] * a6;
+            })
+    }
+}
+
+/// Storage for calculating the matrix exponential.
+pub struct Expm {
+    n: usize,
+    itmax: usize,
+    eye: Array2<f64>,
+    a1: Array2<f64>,
+    a2: Array2<f64>,
+    a4: Array2<f64>,
+    a6: Array2<f64>,
+    a8: Array2<f64>,
+    a_abs: Array2<f64>,
+    u: Array2<f64>,
+    work: Array2<f64>,
+    pivot: Array1<i32>,
+    normest1: Normest1,
+    layout: cblas::Layout,
+}
+
+impl Expm {
+    /// Allocates all space to calculate the matrix exponential for a square matrix of dimension
+    /// n×n.
+    pub fn new(n: usize) -> Self {
+        let eye = Array2::<f64>::eye(n);
+        let a1 = Array2::<f64>::zeros((n, n));
+        let a2 = Array2::<f64>::zeros((n, n));
+        let a4 = Array2::<f64>::zeros((n, n));
+        let a6 = Array2::<f64>::zeros((n, n));
+        let a8 = Array2::<f64>::zeros((n, n));
+        let a_abs = Array2::<f64>::zeros((n, n));
+        let u = Array2::<f64>::zeros((n, n));
+        let work = Array2::<f64>::zeros((n, n));
+        let pivot = Array1::<i32>::zeros(n);
+        let layout = cblas::Layout::RowMajor;
+
+        // TODO: Investigate what an optimal value for t is when estimating the 1-norm.
+        // Python's SciPY uses t=2. Why?
+        let t = 2;
+        let itmax = 5;
+
+        let normest1 = Normest1::new(n, t);
+
+        Expm {
+            n,
+            itmax,
+            eye,
+            a1,
+            a2,
+            a4,
+            a6,
+            a8,
+            a_abs,
+            u,
+            work,
+            pivot,
+            normest1,
+            layout,
+        }
+    }
+
+    /// Calculate the matrix exponential of the n×n matrix `a` storing the result in matrix `b`.
+    ///
+    /// NOTE: Panics if input matrices `a` and `b` don't have matching dimensions, are not square,
+    /// not in row-major order, or don't have the same dimension as the `Expm` object `expm` is
+    /// called on.
+    pub fn expm<S1, S2>(&mut self, a: &ArrayBase<S1, Ix2>, b: &mut ArrayBase<S2, Ix2>)
+    where
+        S1: Data<Elem = f64>,
+        S2: DataMut<Elem = f64>,
+    {
+        assert_eq!(
+            a.dim(),
+            b.dim(),
+            "Input matrices `a` and `b` have to have matching dimensions."
+        );
+        let (n_rows, n_cols) = a.dim();
+        assert_eq!(
+            n_rows, n_cols,
+            "expm is only implemented for square matrices."
+        );
+        assert_eq!(
+            n_rows, self.n,
+            "Dimension mismatch between matrix `a` and preconfigured `Expm` struct."
+        );
+
+        // Rename b to v to be in line with the nomenclature of the original paper.
+        let v = b;
+
+        self.a1.assign(a);
+
+        let n = self.n as i32;
+
+        {
+            let (a_slice, a_layout) =
+                as_slice_with_layout(&self.a1).expect("Matrix `a` not contiguous.");
+            let (a2_slice, _) =
+                as_slice_with_layout_mut(&mut self.a2).expect("Matrix `a2` not contiguous.");
+            assert_eq!(a_layout, self.layout, "Memory layout mismatch between matrices; currently only row major matrices are supported.");
+            unsafe {
+                cblas::dgemm(
+                    self.layout,
+                    cblas::Transpose::None,
+                    cblas::Transpose::None,
+                    n,
+                    n,
+                    n,
+                    1.0,
+                    a_slice,
+                    n,
+                    a_slice,
+                    n,
+                    0.0,
+                    a2_slice,
+                    n as i32,
+                )
+            }
+        }
+
+        let d4_estimated = self
+            .normest1
+            .normest1_pow(&self.a2, 2, self.itmax)
+            .powf(1.0 / 4.0);
+        let d6_estimated = self
+            .normest1
+            .normest1_pow(&self.a2, 3, self.itmax)
+            .powf(1.0 / 6.0);
+        let eta_1 = d4_estimated.max(d6_estimated);
+
+        if eta_1 <= THETA_3 && self.ell(3) == 0 {
+            println!("eta_1 condition");
+            self.solve_via_pade(PadeOrders::_3, v);
+            return;
+        }
+
+        {
+            let (a2_slice, _) =
+                as_slice_with_layout(&self.a2).expect("Matrix `a2` not contiguous.");
+            let (a4_slice, _) =
+                as_slice_with_layout_mut(&mut self.a4).expect("Matrix `a4` not contiguous.");
+            unsafe {
+                cblas::dgemm(
+                    self.layout,
+                    cblas::Transpose::None,
+                    cblas::Transpose::None,
+                    self.n as i32,
+                    self.n as i32,
+                    self.n as i32,
+                    1.0,
+                    a2_slice,
+                    n as i32,
+                    a2_slice,
+                    n as i32,
+                    0.0,
+                    a4_slice,
+                    n as i32,
+                )
+            }
+        }
+
+        let d4_precise = self.normest1.normest1(&self.a4, self.itmax).powf(1.0 / 4.0);
+        let eta_2 = d4_precise.max(d6_estimated);
+
+        if eta_2 <= THETA_5 && self.ell(5) == 0 {
+            println!("eta_2 condition");
+            self.solve_via_pade(PadeOrders::_5, v);
+            return;
+        }
+
+        {
+            let (a2_slice, _) =
+                as_slice_with_layout(&self.a2).expect("Matrix `a2` not contiguous.");
+            let (a4_slice, _) =
+                as_slice_with_layout(&self.a4).expect("Matrix `a4` not contiguous.");
+            let (a6_slice, _) =
+                as_slice_with_layout_mut(&mut self.a6).expect("Matrix `a6` not contiguous.");
+            unsafe {
+                cblas::dgemm(
+                    self.layout,
+                    cblas::Transpose::None,
+                    cblas::Transpose::None,
+                    self.n as i32,
+                    self.n as i32,
+                    self.n as i32,
+                    1.0,
+                    a2_slice,
+                    n as i32,
+                    a4_slice,
+                    n as i32,
+                    0.0,
+                    a6_slice,
+                    n as i32,
+                )
+            }
+        }
+
+        let d6_precise = self.normest1.normest1(&self.a6, self.itmax).powf(1.0 / 6.0);
+        let d8_estimated = self.normest1.normest1_pow(&self.a4, 2, self.itmax);
+        let eta_3 = d6_precise.max(d8_estimated);
+
+        if eta_3 <= THETA_7 && self.ell(7) == 0 {
+            println!("eta_3 (first) condition");
+            self.solve_via_pade(PadeOrders::_7, v);
+            return;
+        }
+
+        {
+            let (a4_slice, _) =
+                as_slice_with_layout(&self.a4).expect("Matrix `a4` not contiguous.");
+            let (a8_slice, _) =
+                as_slice_with_layout_mut(&mut self.a8).expect("Matrix `a8` not contiguous.");
+            unsafe {
+                cblas::dgemm(
+                    self.layout,
+                    cblas::Transpose::None,
+                    cblas::Transpose::None,
+                    self.n as i32,
+                    self.n as i32,
+                    self.n as i32,
+                    1.0,
+                    a4_slice,
+                    n as i32,
+                    a4_slice,
+                    n as i32,
+                    0.0,
+                    a8_slice,
+                    n as i32,
+                )
+            }
+        }
+
+        if eta_3 <= THETA_9 && self.ell(9) == 0 {
+            println!("eta_3 (second) condition");
+            self.solve_via_pade(PadeOrders::_9, v);
+            return;
+        }
+
+        let eta_4 = d8_estimated.max(
+            self.normest1
+                .normest1_prod(&[&self.a4, &self.a6], self.itmax)
+                .powf(1.0 / 10.0),
+        );
+        let eta_5 = eta_3.min(eta_4);
+
+        use std::cmp;
+        use std::f64;
+        let mut s = cmp::max(f64::ceil(f64::log2(eta_5 / THETA_13)) as i32, 0);
+        self.a1.mapv_inplace(|x| x / 2f64.powi(s));
+        s = s + self.ell(13);
+        self.a1.zip_mut_with(a, |x, &y| *x = y / 2f64.powi(s));
+        self.a2.mapv_inplace(|x| x / 2f64.powi(2 * s));
+        self.a4.mapv_inplace(|x| x / 2f64.powi(4 * s));
+        self.a6.mapv_inplace(|x| x / 2f64.powi(6 * s));
+
+        self.solve_via_pade(PadeOrders::_13, v);
+
+        // TODO: Call code fragment 2.1 in the paper if `a` is triangular, instead of the code below.
+        //
+        // NOTE: it's guaranteed that s >= 0 by its definition.
+        let (u_slice, _) =
+            as_slice_with_layout_mut(&mut self.u).expect("Matrix `u` not contiguous.");
+
+        // NOTE: v initially contains r after `solve_via_pade`.
+        let (v_slice, _) = as_slice_with_layout_mut(v).expect("Matrix `v` not contiguous.");
+        for _ in 0..s {
+            unsafe {
+                cblas::dgemm(
+                    self.layout,
+                    cblas::Transpose::None,
+                    cblas::Transpose::None,
+                    self.n as i32,
+                    self.n as i32,
+                    self.n as i32,
+                    1.0,
+                    v_slice,
+                    n as i32,
+                    v_slice,
+                    n as i32,
+                    0.0,
+                    u_slice,
+                    n as i32,
+                )
+            }
+
+            u_slice.swap_with_slice(v_slice);
+        }
+    }
+
+    /// A helper function (as it is called in the original paper) returning the
+    /// $\max(\lceil \log_2(\alpha/u) / 2m \rceil, 0)$, where
+    /// $\alpha = \lvert c_{2m+1}\rvert \texttt{normest}(\lvert A\rvert^{2m+1})/\lVertA\rVert_1$.
+    fn ell(&mut self, m: usize) -> i32 {
+        Zip::from(&mut self.a_abs)
+            .and(&self.a1)
+            .apply(|x, &y| *x = y.abs());
+
+        let c2m1 = pade_error_coefficient(m as u64);
+
+        let norm_abs_a_2m1 = self
+            .normest1
+            .normest1_pow(&self.a_abs, 2 * m + 1, self.itmax);
+        let norm_a = self.normest1.normest1(&self.a1, self.itmax);
+        let alpha = c2m1.abs() * norm_abs_a_2m1 / norm_a;
+
+        // The unit roundoff, defined as half the machine epsilon.
+        let u = std::f64::EPSILON / 2.0;
+
+        use std::cmp;
+        use std::f64;
+
+        cmp::max(0, f64::ceil(f64::log2(alpha / u) / (2 * m) as f64) as i32)
+    }
+
+    fn solve_via_pade<S>(&mut self, pade_order: PadeOrders, v: &mut ArrayBase<S, Ix2>)
+    where
+        S: DataMut<Elem = f64>,
+    {
+        use PadeOrders::*;
+
+        macro_rules! pade {
+            ($order:ty, [$(&$apow:expr),+]) => {
+                <$order as PadeOrder>::calculate_pade_sums(&self.a1, &[$(&$apow),+], &mut self.u, v, &mut self.work);
+            }
+        }
+
+        match pade_order {
+            _3 => pade!(PadeOrder_3, [&self.eye, &self.a2]),
+            _5 => pade!(PadeOrder_5, [&self.eye, &self.a2, &self.a4]),
+            _7 => pade!(PadeOrder_7, [&self.eye, &self.a2, &self.a4, &self.a6]),
+            _9 => pade!(
+                PadeOrder_9,
+                [&self.eye, &self.a2, &self.a4, &self.a6, &self.a8]
+            ),
+            _13 => pade!(PadeOrder_13, [&self.eye, &self.a2, &self.a4, &self.a6]),
+        };
+
+        // Here we set v = p <- u + v and u = q <- -u + v, overwriting u and v via work.
+        self.work.assign(v);
+
+        Zip::from(&mut *v).and(&self.u).apply(|x, &y| {
+            *x = *x + y;
+        });
+
+        Zip::from(&mut self.u).and(&self.work).apply(|x, &y| {
+            *x = -*x + y;
+        });
+
+        let (u_slice, _) =
+            as_slice_with_layout_mut(&mut self.u).expect("Matrix `u` not contiguous.");
+        let (v_slice, _) = as_slice_with_layout_mut(v).expect("Matrix `v` not contiguous.");
+        let (pivot_slice, _) =
+            as_slice_with_layout_mut(&mut self.pivot).expect("Vector `pivot` not contiguous.");
+
+        let n = self.n as i32;
+
+        let layout = {
+            match self.layout {
+                cblas::Layout::ColumnMajor => lapacke::Layout::ColumnMajor,
+                cblas::Layout::RowMajor => lapacke::Layout::RowMajor,
+            }
+        };
+
+        // FIXME: Handle the info for error management.
+        let _ = unsafe { lapacke::dgesv(layout, n, n, u_slice, n, pivot_slice, v_slice, n) };
+    }
+}
+
+/// Calculate the matrix exponential of the n×n matrix `a` storing the result in matrix `b`.
+///
+/// NOTE: Panics if input matrices `a` and `b` don't have matching dimensions, are not square,
+/// not in row-major order, or don't have the same dimension as the `Expm` object `expm` is
+/// called on.
+pub fn expm<S1, S2>(a: &ArrayBase<S1, Ix2>, b: &mut ArrayBase<S2, Ix2>)
+where
+    S1: Data<Elem = f64>,
+    S2: DataMut<Elem = f64>,
+{
+    let (n, _) = a.dim();
+
+    let mut expm = Expm::new(n);
+    expm.expm(a, b);
+}
+
+/// Returns slice and layout underlying an array `a`.
+fn as_slice_with_layout<S, T, D>(a: &ArrayBase<S, D>) -> Option<(&[T], cblas::Layout)>
+where
+    S: Data<Elem = T>,
+    D: Dimension,
+{
+    if let Some(a_slice) = a.as_slice() {
+        Some((a_slice, cblas::Layout::RowMajor))
+    } else if let Some(a_slice) = a.as_slice_memory_order() {
+        Some((a_slice, cblas::Layout::ColumnMajor))
+    } else {
+        None
+    }
+}
+
+/// Returns mutable slice and layout underlying an array `a`.
+fn as_slice_with_layout_mut<S, T, D>(a: &mut ArrayBase<S, D>) -> Option<(&mut [T], cblas::Layout)>
+where
+    S: DataMut<Elem = T>,
+    D: Dimension,
+{
+    if a.as_slice_mut().is_some() {
+        Some((a.as_slice_mut().unwrap(), cblas::Layout::RowMajor))
+    } else if a.as_slice_memory_order_mut().is_some() {
+        Some((
+            a.as_slice_memory_order_mut().unwrap(),
+            cblas::Layout::ColumnMajor,
+        ))
+    } else {
+        None
+    }
+    // XXX: The above is a workaround for Rust not having non-lexical lifetimes yet.
+    // More information here:
+    // http://smallcultfollowing.com/babysteps/blog/2016/04/27/non-lexical-lifetimes-introduction/#problem-case-3-conditional-control-flow-across-functions
+    //
+    // if let Some(slice) = a.as_slice_mut() {
+    //     Some((slice, cblas::Layout::RowMajor))
+    // } else if let Some(slice) = a.as_slice_memory_order_mut() {
+    //     Some((slice, cblas::Layout::ColumnMajor))
+    // } else {
+    //     None
+    // }
+}
--- a/src/fitter.rs
+++ b/src/fitter.rs
@@ -1,11 +1,21 @@
-mod batch;
 mod recursive;

-pub use batch::BatchFitter;
 pub use recursive::RecursiveFitter;

 pub trait Fitter {
    fn add_sample(&mut self, t: f64) -> usize;
    fn allocate(&mut self);
    fn fit(&mut self);
+
+    fn vs(&self, idx: usize) -> f64;
+    fn vs_mut(&mut self, idx: usize) -> &mut f64;
+
+    fn ms(&self, idx: usize) -> f64;
+    fn ms_mut(&mut self, idx: usize) -> &mut f64;
+
+    fn xs(&self, idx: usize) -> f64;
+    fn xs_mut(&mut self, idx: usize) -> &mut f64;
+
+    fn ns(&self, idx: usize) -> f64;
+    fn ns_mut(&mut self, idx: usize) -> &mut f64;
 }
--- a/src/fitter/batch.rs
+++ b/src/fitter/batch.rs
@@ -1,62 +0,0 @@
-use crate::kernel::Kernel;
-
-use super::Fitter;
-
-pub struct BatchFitter {
-    ts_new: Vec<f64>,
-    kernel: Box<dyn Kernel>,
-    ts: Vec<usize>,
-    ms: Vec<usize>,
-    vs: Vec<usize>,
-    ns: Vec<usize>,
-    xs: Vec<usize>,
-    is_fitted: bool,
-}
-
-impl BatchFitter {
-    pub fn new(kernel: Box<dyn Kernel>) -> Self {
-        BatchFitter {
-            ts_new: Vec::new(),
-            kernel,
-            ts: Vec::new(),
-            ms: Vec::new(),
-            vs: Vec::new(),
-            ns: Vec::new(),
-            xs: Vec::new(),
-            is_fitted: true,
-        }
-    }
-}
-
-impl Fitter for BatchFitter {
-    fn add_sample(&mut self, t: f64) -> usize {
-        let idx = self.ts.len() + self.ts_new.len();
-
-        self.ts_new.push(t);
-        self.is_fitted = false;
-
-        idx
-    }
-
-    fn allocate(&mut self) {
-        todo!();
-
-        let n_new = self.ts_new.len();
-        // let zeros =
-        /*
-        n_new = len(self.ts_new)
-        zeros = np.zeros(n_new)
-        self.ts = np.concatenate((self.ts, self.ts_new))
-        self.ms = np.concatenate((self.ms, zeros))
-        self.vs = np.concatenate((self.vs, self.kernel.k_diag(self.ts_new)))
-        self.ns = np.concatenate((self.ns, zeros))
-        self.xs = np.concatenate((self.xs, zeros))
-        # Clear the list of pending samples.
-        self.ts_new = list()
-        */
-    }
-
-    fn fit(&mut self) {
-        todo!();
-    }
-}
--- a/src/fitter/recursive.rs
+++ b/src/fitter/recursive.rs
@@ -1,5 +1,3 @@
-use std::mem;
-
 use ndarray::prelude::*;
 use ndarray::stack;

@@ -18,8 +16,8 @@ pub struct RecursiveFitter {
    is_fitted: bool,
    h: Array1<f64>,
    i: ArrayD<f64>,
-    a: ArrayD<f64>,
-    q: ArrayD<f64>,
+    a: Vec<Array2<f64>>,
+    q: Vec<Array2<f64>>,
    m_p: ArrayD<f64>,
    p_p: ArrayD<f64>,
    m_f: ArrayD<f64>,
@@ -44,8 +42,8 @@ impl RecursiveFitter {
            is_fitted: true,
            h,
            i: Array::eye(m).into_dyn(),
-            a: Array::zeros((0, m, m)).into_dyn(),
-            q: Array::zeros((0, m, m)).into_dyn(),
+            a: Vec::new(),
+            q: Vec::new(),
            m_p: Array::zeros((0, m)).into_dyn(),
            p_p: Array::zeros((0, m, m)).into_dyn(),
            m_f: Array::zeros((0, m)).into_dyn(),
@@ -90,7 +88,7 @@ impl Fitter for RecursiveFitter {
            .collect::<Vec<f64>>();

        let mean = Array::from_shape_vec((self.ts_new.len(), self.kernel.order()), mean)
-            .unwrap()
+            .expect("failed to create mean matrix")
            .into_dyn();

        let cov = self
@@ -109,7 +107,7 @@ impl Fitter for RecursiveFitter {
            (self.ts_new.len(), self.kernel.order(), self.kernel.order()),
            cov,
        )
-        .unwrap()
+        .expect("failed to create cov matrix")
        .into_dyn();

        self.m_p = stack![Axis(0), self.m_p, mean];
@@ -122,35 +120,58 @@ impl Fitter for RecursiveFitter {
        self.p_s = stack![Axis(0), self.p_s, cov];

        // Compute the new transition and noise covariance matrices.
-        let m = self.kernel.order();
-
-        self.a = stack![Axis(0), self.a, Array::zeros((n_new, m, m)).into_dyn()];
-        self.q = stack![Axis(0), self.q, Array::zeros((n_new, m, m)).into_dyn()];
-
        for i in (self.ts.len() - n_new)..self.ts.len() {
            if i == 0 {
                continue;
            }

-            self.a[i - 1] = self.kernel.transition(self.ts[i - 1], self.ts[1]);
-            // self.q[i - 1] = self.kernel.noise_cov(self.ts[i - 1], self.ts[1]);
+            self.a
+                .push(self.kernel.transition(self.ts[i - 1], self.ts[i]));
+            self.q
+                .push(self.kernel.noise_cov(self.ts[i - 1], self.ts[i]));
        }

-        todo!();
+        let m = self.kernel.order();

-        /*
-        for i in range(len(self.ts) - n_new, len(self.ts)):
-            if i == 0:
-                # Very first sample, no need to compute anything.
-                continue
-            self._A[i-1] = self.kernel.transition(self.ts[i-1], self.ts[i])
-            self._Q[i-1] = self.kernel.noise_cov(self.ts[i-1], self.ts[i])
-        # Clear the list of pending samples.
-        self.ts_new = list()
-        */
+        self.a.push(Array2::zeros((m, m)));
+        self.q.push(Array2::zeros((m, m)));
+
+        self.ts_new.clear();
    }

    fn fit(&mut self) {
        todo!();
    }
+
+    fn vs(&self, idx: usize) -> f64 {
+        *&self.vs[idx]
+    }
+
+    fn vs_mut(&mut self, idx: usize) -> &mut f64 {
+        &mut self.vs[idx]
+    }
+
+    fn ms(&self, idx: usize) -> f64 {
+        *&self.ms[idx]
+    }
+
+    fn ms_mut(&mut self, idx: usize) -> &mut f64 {
+        &mut self.ms[idx]
+    }
+
+    fn xs(&self, idx: usize) -> f64 {
+        *&self.xs[idx]
+    }
+
+    fn xs_mut(&mut self, idx: usize) -> &mut f64 {
+        &mut self.xs[idx]
+    }
+
+    fn ns(&self, idx: usize) -> f64 {
+        *&self.ns[idx]
+    }
+
+    fn ns_mut(&mut self, idx: usize) -> &mut f64 {
+        &mut self.ns[idx]
+    }
 }
--- a/src/kernel.rs
+++ b/src/kernel.rs
@@ -14,7 +14,38 @@ pub trait Kernel {
    fn state_mean(&self, t: f64) -> Array1<f64>;
    fn state_cov(&self, t: f64) -> Array2<f64>;
    fn measurement_vector(&self) -> Array1<f64>;
-    fn transition(&self, t0: f64, t1: f64) -> Array2<f64>;
+    fn feedback(&self) -> Array2<f64>;
+
+    fn transition(&self, t0: f64, t1: f64) -> Array2<f64> {
+        let f = self.feedback();
+
+        let a = f * (t1 - t0);
+        let mut b = Array2::<f64>::zeros(a.dim());
+
+        crate::expm::expm(&a, &mut b);
+
+        b
+    }
+
+    fn noise_cov(&self, t0: f64, t1: f64) -> Array2<f64> {
+        /*
+        mat = self.noise_effect.dot(self.noise_density).dot(self.noise_effect.T)
+        #print(g)
+        print(mat)
+        Phi = np.vstack((
+                np.hstack((self.feedback, mat)),
+                np.hstack((np.zeros_like(mat), -self.feedback.T))))
+        print(Phi)
+        m = self.order
+        AB = np.dot(sp.linalg.expm(Phi * (t2 - t1)), np.eye(2*m, m, k=-m))
+        print(AB)
+        return sp.linalg.solve(AB[m:,:].T, AB[:m,:].T)
+        */
+
+        // let mat = self.noise_effect()
+
+        todo!();
+    }
 }

 impl Kernel for Vec<Box<dyn Kernel>> {
@@ -74,7 +105,57 @@ impl Kernel for Vec<Box<dyn Kernel>> {
        Array1::from(data)
    }

-    fn transition(&self, t0: f64, t1: f64) -> Array2<f64> {
-        todo!();
+    fn feedback(&self) -> Array2<f64> {
+        let data = self
+            .iter()
+            .map(|kernel| kernel.feedback())
+            .collect::<Vec<_>>();
+
+        let dim = data
+            .iter()
+            .fold((0, 0), |(w, h), m| (w + m.ncols(), h + m.nrows()));
+
+        let mut feedback = Array2::zeros(dim);
+
+        let mut r_d = 0;
+        let mut c_d = 0;
+
+        for m in data {
+            for ((r, c), v) in m.indexed_iter() {
+                feedback[(r + r_d, c + c_d)] = *v;
+            }
+
+            r_d += m.nrows();
+            c_d += m.ncols();
+        }
+
+        feedback
+    }
+
+    fn noise_cov(&self, t0: f64, t1: f64) -> Array2<f64> {
+        let data = self
+            .iter()
+            .map(|kernel| kernel.noise_cov(t0, t1))
+            .collect::<Vec<_>>();
+
+        let dim = data
+            .iter()
+            .fold((0, 0), |(w, h), m| (w + m.ncols(), h + m.nrows()));
+
+        let mut cov = Array2::zeros(dim);
+
+        let mut r_d = 0;
+        let mut c_d = 0;
+
+        for m in data {
+            for ((r, c), v) in m.indexed_iter() {
+                cov[(r + r_d, c + c_d)] = *v;
+            }
+
+            r_d += m.nrows();
+            c_d += m.ncols();
+        }
+
+        cov
    }
 }
--- a/src/kernel/constant.rs
+++ b/src/kernel/constant.rs
@@ -33,7 +33,15 @@ impl Kernel for Constant {
        array![1.0]
    }

+    fn feedback(&self) -> Array2<f64> {
+        array![[0.0]]
+    }
+
    fn transition(&self, t0: f64, t1: f64) -> Array2<f64> {
-        todo!();
+        array![[1.0]]
+    }
+
+    fn noise_cov(&self, t0: f64, t1: f64) -> Array2<f64> {
+        array![[0.0]]
    }
 }
--- a/src/kernel/exponential.rs
+++ b/src/kernel/exponential.rs
@@ -34,7 +34,15 @@ impl Kernel for Exponential {
        array![1.0]
    }

+    fn feedback(&self) -> Array2<f64> {
+        (-1.0 / self.l_scale) * array![[1.0]]
+    }
+
    fn transition(&self, t0: f64, t1: f64) -> Array2<f64> {
-        todo!();
+        -(t1 - t0) / self.l_scale * array![[1.0]]
+    }
+
+    fn noise_cov(&self, t0: f64, t1: f64) -> Array2<f64> {
+        self.var * (1.0 - (-2.0 * (t1 - t0) / self.l_scale).exp()) * array![[1.0]]
    }
 }
--- a/src/kernel/matern52.rs
+++ b/src/kernel/matern52.rs
@@ -45,7 +45,56 @@ impl Kernel for Matern52 {
        array![1.0, 0.0, 0.0]
    }

+    fn feedback(&self) -> Array2<f64> {
+        let a = self.lambda;
+
+        array![
+            [0.0, 1.0, 0.0],
+            [0.0, 0.0, 1.0],
+            [-a.powi(3), -3.0 * a.powi(2), -3.0 * a],
+        ]
+    }
+
    fn transition(&self, t0: f64, t1: f64) -> Array2<f64> {
-        todo!();
+        let d = t1 - t0;
+        let a = self.lambda;
+        let da = d * a;
+
+        let ba = array![
+            [(da * da) / 2.0 + da + 1.0, d * (da + 1.0), d * d / 2.0],
+            [
+                -(da * da * a) / 2.0,
+                -da * da + da + 1.0,
+                -(d / 2.0) * (da - 2.0)
+            ],
+            [
+                (da * a * a / 2.0) * (da - 2.0),
+                (da * a) * (da - 3.0),
+                (da * da - 4.0 * da + 2.0) / 2.0
+            ]
+        ];
+
+        (-da).exp() * ba
+    }
+
+    fn noise_cov(&self, t0: f64, t1: f64) -> Array2<f64> {
+        let d = t1 - t0;
+        let a = self.lambda;
+        let da = d * a;
+
+        let c = (-2.0 * da).exp();
+
+        let x11 = -(1.0 / 3.0)
+            * (c * (2.0 * da.powi(4) + 4.0 * da.powi(3) + 6.0 * da * da + 6.0 * da + 3.0) - 3.0);
+        let x12 = c * (2.0 / 3.0) * a * da.powi(4);
+        let x13 = -(a * a / 3.0)
+            * (c * (2.0 * da.powi(4) - 4.0 * da.powi(3) - 2.0 * da * da - 2.0 * da - 1.0) + 1.0);
+        let x22 = -(a * a / 3.0)
+            * (c * (2.0 * da.powi(4) - 4.0 * da.powi(3) + 2.0 * da * da + 2.0 * da + 1.0) - 1.0);
+        let x23 = c * (2.0 / 3.0) * da * da * a.powi(3) * (da - 2.0).powi(2);
+        let x33 = -(a.powi(4) / 3.0)
+            * (c * (2.0 * da.powi(4) - 12.0 * da.powi(3) + 22.0 * da * da - 10.0 * da + 3.0) - 3.0);
+
+        self.var * array![[x11, x12, x13], [x12, x22, x23], [x13, x23, x33]]
    }
 }
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,10 +1,14 @@
 // https://github.com/lucasmaystre/kickscore/tree/master/kickscore
+
+mod condest;
+mod expm;
 mod fitter;
 mod item;
 pub mod kernel;
 mod model;
 pub mod observation;
 mod storage;
+mod utils;

 pub use kernel::Kernel;
 pub use model::*;
--- a/src/model.rs
+++ b/src/model.rs
@@ -93,8 +93,8 @@ impl BinaryModel {

            for obs in &mut self.observations {
                let diff = match method {
-                    BinaryModelFitMethod::Ep => obs.ep_update(lr),
-                    BinaryModelFitMethod::Kl => obs.kl_update(lr),
+                    BinaryModelFitMethod::Ep => obs.ep_update(lr, &mut self.storage),
+                    BinaryModelFitMethod::Kl => obs.kl_update(lr, &mut self.storage),
                };

                if diff > max_diff {
--- a/src/observation.rs
+++ b/src/observation.rs
@@ -1,8 +1,12 @@
 mod ordinal;

+use crate::storage::Storage;
+
 pub use ordinal::*;

 pub trait Observation {
-    fn ep_update(&mut self, lr: f64) -> f64;
-    fn kl_update(&mut self, lr: f64) -> f64;
+    fn match_moments(&self, mean_cav: f64, cov_cav: f64) -> (f64, f64, f64);
+
+    fn ep_update(&mut self, lr: f64, storage: &mut Storage) -> f64;
+    fn kl_update(&mut self, lr: f64, storage: &mut Storage) -> f64;
 }
--- a/src/observation/ordinal.rs
+++ b/src/observation/ordinal.rs
@@ -1,7 +1,18 @@
 use crate::storage::Storage;
+use crate::utils::logphi;

 use super::Observation;

+fn mm_probit_win(mean_cav: f64, cov_cav: f64) -> (f64, f64, f64) {
+    // Adapted from the GPML function `likErf.m`.
+    let z = mean_cav / (1.0 + cov_cav).sqrt();
+    let (logpart, val) = logphi(z);
+    let dlogpart = val / (1.0 + cov_cav).sqrt(); // 1st derivative w.r.t. mean.
+    let d2logpart = -val * (z + val) / (1.0 + cov_cav);
+
+    (logpart, dlogpart, d2logpart)
+}
+
 pub struct ProbitWinObservation {
    m: usize,
    items: Vec<usize>,
@@ -10,7 +21,7 @@ pub struct ProbitWinObservation {
    ns_cav: Vec<f64>,
    xs_cav: Vec<f64>,
    t: f64,
-    logpart: usize,
+    logpart: f64,
    exp_ll: usize,
    margin: f64,
 }
@@ -29,10 +40,10 @@ impl ProbitWinObservation {
                .iter()
                .map(|(id, _)| storage.get_item(*id).fitter.add_sample(t))
                .collect(),
-            ns_cav: Vec::new(),
-            xs_cav: Vec::new(),
+            ns_cav: (0..elems.len()).map(|_| 0.0).collect(),
+            xs_cav: (0..elems.len()).map(|_| 0.0).collect(),
            t,
-            logpart: 0,
+            logpart: 0.0,
            exp_ll: 0,
            margin,
        }
@@ -40,11 +51,63 @@ impl ProbitWinObservation {
 }

 impl Observation for ProbitWinObservation {
-    fn ep_update(&mut self, lr: f64) -> f64 {
-        todo!();
+    fn match_moments(&self, mean_cav: f64, cov_cav: f64) -> (f64, f64, f64) {
+        mm_probit_win(mean_cav - self.margin, cov_cav)
    }

-    fn kl_update(&mut self, lr: f64) -> f64 {
+    fn ep_update(&mut self, lr: f64, storage: &mut Storage) -> f64 {
+        // Mean and variance of the cavity distribution in function space.
+        let mut f_mean_cav = 0.0;
+        let mut f_var_cav = 0.0;
+
+        for i in 0..self.m {
+            let item = storage.item(self.items[i]);
+            let idx = self.indices[i];
+            let coeff = self.coeffs[i];
+
+            // Compute the natural parameters of the cavity distribution.
+            let x_tot = 1.0 / item.fitter.vs(idx);
+            let n_tot = x_tot * item.fitter.ms(idx);
+            let x_cav = x_tot - item.fitter.xs(idx);
+            let n_cav = n_tot - item.fitter.ns(idx);
+
+            self.xs_cav[i] = x_cav;
+            self.ns_cav[i] = n_cav;
+
+            // Adjust the function-space cavity mean & variance.
+            f_mean_cav += coeff * n_cav / x_cav;
+            f_var_cav += coeff * coeff / x_cav;
+        }
+
+        // Moment matching.
+        let (logpart, dlogpart, d2logpart) = self.match_moments(f_mean_cav, f_var_cav);
+
+        for i in 0..self.m {
+            let item = storage.item_mut(self.items[i]);
+            let idx = self.indices[i];
+            let coeff = self.coeffs[i];
+
+            let x_cav = self.xs_cav[i];
+            let n_cav = self.ns_cav[i];
+
+            // Update the elements' parameters.
+            let denom = 1.0 + coeff * coeff * d2logpart / x_cav;
+            let x = -coeff * coeff * d2logpart / denom;
+            let n = coeff * (dlogpart - coeff * (n_cav / x_cav) * d2logpart) / denom;
+
+            *item.fitter.xs_mut(idx) = (1.0 - lr) * item.fitter.xs(idx) + lr * x;
+            *item.fitter.ns_mut(idx) = (1.0 - lr) * item.fitter.ns(idx) + lr * n;
+        }
+
+        let diff = (self.logpart - logpart).abs();
+
+        // Save log partition function value for the log-likelihood.
+        self.logpart = logpart;
+
+        diff
+    }
+
+    fn kl_update(&mut self, lr: f64, storage: &mut Storage) -> f64 {
        todo!();
    }
 }
@@ -60,11 +123,15 @@ impl LogitWinObservation {
 }

 impl Observation for LogitWinObservation {
-    fn ep_update(&mut self, lr: f64) -> f64 {
+    fn match_moments(&self, mean_cav: f64, cov_cav: f64) -> (f64, f64, f64) {
        todo!();
    }

-    fn kl_update(&mut self, lr: f64) -> f64 {
+    fn ep_update(&mut self, lr: f64, storage: &mut Storage) -> f64 {
+        todo!();
+    }
+
+    fn kl_update(&mut self, lr: f64, storage: &mut Storage) -> f64 {
        todo!();
    }
 }
--- a/src/storage.rs
+++ b/src/storage.rs
@@ -34,6 +34,14 @@ impl Storage {
        &mut self.items[id]
    }

+    pub fn item(&self, id: usize) -> &Item {
+        &self.items[id]
+    }
+
+    pub fn item_mut(&mut self, id: usize) -> &mut Item {
+        &mut self.items[id]
+    }
+
    pub fn items_mut(&mut self) -> impl Iterator<Item = &mut Item> {
        self.items.iter_mut()
    }
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -0,0 +1,113 @@
+use std::f64::consts::PI;
+
+const CS: [f64; 14] = [
+    0.00048204,
+    -0.00142906,
+    0.0013200243174,
+    0.0009461589032,
+    -0.0045563339802,
+    0.00556964649138,
+    0.00125993961762116,
+    -0.01621575378835404,
+    0.02629651521057465,
+    -0.001829764677455021,
+    2.0 * (1.0 - PI / 3.0),
+    (4.0 - PI) / 3.0,
+    1.0,
+    1.0,
+];
+
+const RS: [f64; 5] = [
+    1.2753666447299659525,
+    5.019049726784267463450,
+    6.1602098531096305441,
+    7.409740605964741794425,
+    2.9788656263939928886,
+];
+
+const QS: [f64; 6] = [
+    2.260528520767326969592,
+    9.3960340162350541504,
+    12.048951927855129036034,
+    17.081440747466004316,
+    9.608965327192787870698,
+    3.3690752069827527677,
+];
+
+/// Normal cumulative density function.
+fn normcdf(x: f64) -> f64 {
+    // If X ~ N(0,1), returns P(X < x).
+    // erfc(-x / SQRT2) / 2.0
+
+    todo!();
+}
+
+/// Compute the log of the normal cumulative density function.
+pub fn logphi(z: f64) -> (f64, f64) {
+    // Adapted from the GPML function `logphi.m`.
+    let SQRT2: f64 = 2.0f64.sqrt();
+    let SQRT2PI: f64 = (2.0 * PI).sqrt();
+
+    if z * z < 0.0492 {
+        // First case: z close to zero.
+        let coef = -z / SQRT2PI;
+        let mut val = 0.0;
+
+        for c in &CS {
+            val = coef * (c + val);
+        }
+
+        let res = -2.0 * val - 2.0f64.ln();
+        let dres = (-(z * z) / 2.0 - res).exp() / SQRT2PI;
+
+        (res, dres)
+    } else if z < -11.3137 {
+        // Second case: z very small.
+        let mut num = 0.5641895835477550741;
+
+        for r in &RS {
+            num = -z * num / SQRT2 + r;
+        }
+
+        let mut den = 1.0;
+
+        for q in &QS {
+            den = -z * den / SQRT2 + q;
+        }
+
+        let res = (num / (2.0 * den)).ln() - (z * z) / 2.0;
+        let dres = (den / num).abs() * (2.0 / PI).sqrt();
+
+        (res, dres)
+    } else {
+        let res = normcdf(z).ln();
+        let dres = (-(z * z) / 2.0 - res).exp() / SQRT2PI;
+
+        (res, dres)
+    }
+
+    /*
+    if z * z < 0.0492:
+        # First case: z close to zero.
+        coef = -z / SQRT2PI
+        val = 0
+        for c in CS:
+            val = coef * (c + val)
+        res = -2 * val - log(2)
+        dres = exp(-(z * z) / 2 - res) / SQRT2PI
+    elif z < -11.3137:
+        # Second case: z very small.
+        num = 0.5641895835477550741
+        for r in RS:
+            num = -z * num / SQRT2 + r
+        den = 1.0
+        for q in QS:
+            den = -z * den / SQRT2 + q
+        res = log(num / (2 * den)) - (z * z) / 2
+        dres = abs(den / num) * sqrt(2.0 / pi)
+    else:
+        res = log(normcdf(z))
+        dres = exp(-(z * z) / 2 - res) / SQRT2PI
+    return res, dres
+    */
+}