Improve ASCII handling, improve performance

This commit is contained in:
2022-05-24 21:13:43 +02:00
parent 9f44196e6c
commit 17ea7bc60c
3 changed files with 26 additions and 6 deletions

View File

@@ -11,7 +11,7 @@ name = "bench"
harness = false
[dependencies]
fst = "0.4.7"
# fst = "0.4.7"
tinyvec = { version = "1.6.0", features = ["alloc"] }
u-fst = { path = "../u-fst" }
@@ -22,4 +22,5 @@ u-fst = { path = "../u-fst" }
criterion = "0.3.5"
proptest = "1.0.0"
similar-asserts = "1.2.0"
unic-normal = "0.9.0"
unicode-normalization = "0.1.19"

View File

@@ -3,6 +3,7 @@ use std::fs;
use criterion::{criterion_group, criterion_main, Criterion};
use u_norm::nfd;
use unic_normal::StrNormalForm;
use unicode_normalization::UnicodeNormalization;
const ASCII: &str = "all types of normalized";
@@ -10,8 +11,13 @@ const ASCII: &str = "all types of normalized";
fn criterion_benchmark(c: &mut Criterion) {
let mut group = c.benchmark_group("ASCII");
group.bench_function("unf", |b| b.iter(|| nfd(ASCII).count()));
group.bench_function("unicode-normalization", |b| b.iter(|| ASCII.nfd().count()));
group.bench_function("u-norm", |b| b.iter(|| nfd(ASCII).count()));
group.bench_function("unicode-normalization", |b| {
b.iter(|| UnicodeNormalization::nfd(ASCII).count())
});
group.bench_function("unic-normal", |b| {
b.iter(|| StrNormalForm::nfd(ASCII).count())
});
group.finish();
@@ -19,8 +25,13 @@ fn criterion_benchmark(c: &mut Criterion) {
let mut group = c.benchmark_group("Long");
group.bench_function("unf", |b| b.iter(|| nfd(&long).count()));
group.bench_function("unicode-normalization", |b| b.iter(|| long.nfd().count()));
group.bench_function("u-norm", |b| b.iter(|| nfd(&long).count()));
group.bench_function("unicode-normalization", |b| {
b.iter(|| UnicodeNormalization::nfd(long.as_str()).count())
});
group.bench_function("unic-normal", |b| {
b.iter(|| StrNormalForm::nfd(long.as_str()).count())
});
group.finish();
}

View File

@@ -26,6 +26,14 @@ impl Buffer {
}
}
/// Appends `ch` to the buffer paired with a hard-coded value of 0.
///
/// Unlike `push_back`, this does not call `table::lookup` for the character;
/// the 0 is presumably its canonical combining class (TODO confirm against
/// the buffer's element type). `decompose` uses this path for 7-bit ASCII.
#[inline(always)]
fn push_zero(&mut self, ch: char) {
    // Runs before the push; presumably puts the still-pending entries in
    // order first (confirm against `sort_pending`).
    self.sort_pending();
    self.buffer.push((0, ch));

    // `ready` now ends just past the entry pushed above.
    let filled = self.buffer.len();
    self.ready.end = filled;
}
#[inline(always)]
fn push_back(&mut self, ch: char) {
let class = table::lookup(ch).combining_class();
@@ -125,7 +133,7 @@ const S_COUNT: u32 = L_COUNT * N_COUNT;
fn decompose(c: char, buffer: &mut Buffer) {
// 7-bit ASCII never decomposes
if c <= '\x7f' {
buffer.push_back(c);
buffer.push_zero(c);
return;
}