diff --git a/crates/u-norm/Cargo.toml b/crates/u-norm/Cargo.toml index 7194dd1..b0a080a 100644 --- a/crates/u-norm/Cargo.toml +++ b/crates/u-norm/Cargo.toml @@ -11,7 +11,7 @@ name = "bench" harness = false [dependencies] -fst = "0.4.7" +# fst = "0.4.7" tinyvec = { version = "1.6.0", features = ["alloc"] } u-fst = { path = "../u-fst" } @@ -22,4 +22,5 @@ u-fst = { path = "../u-fst" } criterion = "0.3.5" proptest = "1.0.0" similar-asserts = "1.2.0" +unic-normal = "0.9.0" unicode-normalization = "0.1.19" diff --git a/crates/u-norm/benches/bench.rs b/crates/u-norm/benches/bench.rs index 967f24d..59b89d2 100644 --- a/crates/u-norm/benches/bench.rs +++ b/crates/u-norm/benches/bench.rs @@ -3,6 +3,7 @@ use std::fs; use criterion::{criterion_group, criterion_main, Criterion}; use u_norm::nfd; +use unic_normal::StrNormalForm; use unicode_normalization::UnicodeNormalization; const ASCII: &str = "all types of normalized"; @@ -10,8 +11,13 @@ const ASCII: &str = "all types of normalized"; fn criterion_benchmark(c: &mut Criterion) { let mut group = c.benchmark_group("ASCII"); - group.bench_function("unf", |b| b.iter(|| nfd(ASCII).count())); - group.bench_function("unicode-normalization", |b| b.iter(|| ASCII.nfd().count())); + group.bench_function("u-norm", |b| b.iter(|| nfd(ASCII).count())); + group.bench_function("unicode-normalization", |b| { + b.iter(|| UnicodeNormalization::nfd(ASCII).count()) + }); + group.bench_function("unic-normal", |b| { + b.iter(|| StrNormalForm::nfd(ASCII).count()) + }); group.finish(); @@ -19,8 +25,13 @@ fn criterion_benchmark(c: &mut Criterion) { let mut group = c.benchmark_group("Long"); - group.bench_function("unf", |b| b.iter(|| nfd(&long).count())); - group.bench_function("unicode-normalization", |b| b.iter(|| long.nfd().count())); + group.bench_function("u-norm", |b| b.iter(|| nfd(&long).count())); + group.bench_function("unicode-normalization", |b| { + b.iter(|| UnicodeNormalization::nfd(long.as_str()).count()) + }); + group.bench_function("unic-normal", |b| { + b.iter(|| StrNormalForm::nfd(long.as_str()).count()) + }); group.finish(); } diff --git a/crates/u-norm/src/lib.rs b/crates/u-norm/src/lib.rs index 3d0e713..cbc8c64 100644 --- a/crates/u-norm/src/lib.rs +++ b/crates/u-norm/src/lib.rs @@ -26,6 +26,14 @@ impl Buffer { } } + #[inline(always)] + fn push_zero(&mut self, ch: char) { + self.sort_pending(); + + self.buffer.push((0, ch)); + self.ready.end = self.buffer.len(); + } + #[inline(always)] fn push_back(&mut self, ch: char) { let class = table::lookup(ch).combining_class(); @@ -125,7 +133,7 @@ const S_COUNT: u32 = L_COUNT * N_COUNT; fn decompose(c: char, buffer: &mut Buffer) { // 7-bit ASCII never decomposes if c <= '\x7f' { - buffer.push_back(c); + buffer.push_zero(c); return; }