Rename crates
@@ -1,5 +1,5 @@
 [package]
-name = "ufst"
+name = "u-fst"
 version = "0.4.7" #:version
 authors = ["Andrew Gallant <jamslam@gmail.com>", "Anders Olsson <anders.e.olsson@gmail.com>"]
 description = """
@@ -1,16 +1,25 @@
 [package]
-name = "unf"
+name = "u-norm"
 version = "0.1.0"
 edition = "2021"
 
+[lib]
+bench = false
+
+[[bench]]
+name = "bench"
+harness = false
+
 [dependencies]
 fst = "0.4.7"
 tinyvec = { version = "1.6.0", features = ["alloc"] }
-ufst = { path = "../ufst" }
+u-fst = { path = "../u-fst" }
 
 [build-dependencies]
-ufst = { path = "../ufst" }
+u-fst = { path = "../u-fst" }
 
 [dev-dependencies]
+criterion = "0.3.5"
 proptest = "1.0.0"
 similar-asserts = "1.2.0"
+unicode-normalization = "0.1.19"
5 crates/u-norm/README.md Normal file
@@ -0,0 +1,5 @@
+# UNF
+
+## Todo
+
+- [ ] Change Decomposition to be `struct Decomposition(u64)` that implements `Iterator`
29 crates/u-norm/benches/bench.rs Normal file
@@ -0,0 +1,29 @@
+use std::fs;
+
+use criterion::{criterion_group, criterion_main, Criterion};
+
+use u_norm::nfd;
+use unicode_normalization::UnicodeNormalization;
+
+const ASCII: &str = "all types of normalized";
+
+fn criterion_benchmark(c: &mut Criterion) {
+    let mut group = c.benchmark_group("ASCII");
+
+    group.bench_function("unf", |b| b.iter(|| nfd(ASCII).count()));
+    group.bench_function("unicode-normalization", |b| b.iter(|| ASCII.nfd().count()));
+
+    group.finish();
+
+    let long = fs::read_to_string("benches/long.txt").unwrap();
+
+    let mut group = c.benchmark_group("Long");
+
+    group.bench_function("unf", |b| b.iter(|| nfd(&long).count()));
+    group.bench_function("unicode-normalization", |b| b.iter(|| long.nfd().count()));
+
+    group.finish();
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
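Since `unicode-normalization` and `similar-asserts` are both dev-dependencies of the renamed crate, the same inputs can also be cross-checked against the reference implementation. A minimal sketch, not part of this commit; the test name and input string are illustrative:

```rust
use similar_asserts::assert_eq;
use unicode_normalization::UnicodeNormalization;

#[test]
fn nfd_matches_reference_crate() {
    let input = "all types of normalized";

    // Both iterators yield `char`s in NFD order, so collecting into a
    // String makes the two outputs directly comparable.
    let ours: String = u_norm::nfd(input).collect();
    let reference: String = input.nfd().collect();

    assert_eq!(ours, reference);
}
```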
2366 crates/u-norm/benches/long.txt Normal file
File diff suppressed because one or more lines are too long
@@ -2,7 +2,7 @@ use std::env;
 use std::fs;
 use std::path::Path;
 
-use ufst::raw::Fst;
+use u_fst::raw::Fst;
 
 fn main() {
     let data = fs::read_to_string("data/UnicodeData.txt").unwrap();
@@ -6,8 +6,6 @@ use tinyvec::TinyVec;
 
 pub mod table;
 
-use table::Decomposition;
-
 pub fn nfd(s: &str) -> Decompositions<Chars<'_>> {
     Decompositions {
         iter: s.chars().fuse(),
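For orientation, `nfd` above is the crate's public entry point after the rename. A minimal usage sketch; the input and expected output are illustrative and rely only on standard Unicode NFD behaviour:

```rust
fn main() {
    // U+00C5 (Å) decomposes canonically to U+0041 followed by U+030A.
    let decomposed: String = u_norm::nfd("\u{C5}").collect();

    assert_eq!(decomposed, "A\u{30A}");
}
```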
@@ -28,6 +26,7 @@ impl Buffer {
         }
     }
 
+    #[inline(always)]
    fn push_back(&mut self, ch: char) {
        let class = table::lookup(ch).combining_class();
 
@@ -41,10 +40,12 @@ impl Buffer {
         }
     }
 
+    #[inline(always)]
     fn sort_pending(&mut self) {
         self.buffer[self.ready.end..].sort_by_key(|k| k.0);
     }
 
+    #[inline(always)]
     fn reset(&mut self) {
         let pending = self.buffer.len() - self.ready.end;
 
@@ -56,6 +57,7 @@ impl Buffer {
         self.ready = 0..0;
     }
 
+    #[inline(always)]
     fn increment_next_ready(&mut self) {
         let next = self.ready.start + 1;
 
@@ -134,14 +136,8 @@ fn decompose(c: char, buffer: &mut Buffer) {
     }
 
     if let Some(decomposed) = table::lookup(c).decomposition() {
-        match decomposed {
-            Decomposition::Single(f) => {
-                decompose(f, buffer);
-            }
-            Decomposition::Double(f, s) => {
-                decompose(f, buffer);
-                decompose(s, buffer);
-            }
+        for d in decomposed {
+            decompose(d, buffer);
         }
         return;
     }
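Because `Decomposition` now implements `Iterator` (see the table.rs hunk further down), single and double mappings no longer need separate arms here. A generic sketch of the resulting control flow, with a hypothetical `lookup` closure standing in for `table::lookup(c).decomposition()`; this is not the crate's code:

```rust
// Recursively expand a code point through a lookup whose result may itself
// contain decomposable code points, mirroring the `for d in decomposed` loop.
fn expand(c: char, lookup: &impl Fn(char) -> Option<Vec<char>>, out: &mut Vec<char>) {
    if let Some(decomposed) = lookup(c) {
        for d in decomposed {
            expand(d, lookup, out);
        }
        return;
    }
    out.push(c);
}
```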
14 crates/u-norm/src/main.rs Normal file
@@ -0,0 +1,14 @@
+use u_norm::table;
+
+fn main() {
+    for c in '\x00'..='\x7f' {
+        let d = table::lookup(c);
+
+        println!(
+            "{:?} class: {}, decomp: {:?}",
+            c,
+            d.combining_class(),
+            d.decomposition().map(|d| d.collect::<Vec<_>>())
+        );
+    }
+}
@@ -1,8 +1,9 @@
-use ufst::raw::Fst;
+use u_fst::raw::Fst;
 
 const TABLE: Fst<&'static [u8]> =
     Fst::new_unchecked(include_bytes!(concat!(env!("OUT_DIR"), "/table.fst")));
 
+#[inline(always)]
 pub fn lookup(ch: char) -> Entry {
     Entry::new(
         TABLE
@@ -13,9 +14,23 @@ pub fn lookup(ch: char) -> Entry {
 }
 
 #[derive(Clone, Copy, PartialEq, Debug)]
-pub enum Decomposition {
-    Single(char),
-    Double(char, char),
+pub struct Decomposition(u64);
+
+impl Iterator for Decomposition {
+    type Item = char;
+
+    #[inline(always)]
+    fn next(&mut self) -> Option<Self::Item> {
+        let d = (self.0 & 0x1FFFFF) as u32;
+
+        if d > 0 {
+            self.0 >>= 21;
+
+            Some(unsafe { char::from_u32_unchecked(d) })
+        } else {
+            None
+        }
+    }
 }
 
 #[derive(Clone, Copy, PartialEq, Debug)]
@@ -26,26 +41,17 @@ impl Entry {
         Self(data)
     }
 
+    #[inline(always)]
     pub fn combining_class(&self) -> u8 {
         (self.0 & 0xFF) as u8
     }
 
-    pub(crate) fn decomposition(&self) -> Option<Decomposition> {
-        let m1 = ((self.0 >> 8) & 0x1FFFFF) as u32;
+    #[inline(always)]
+    pub fn decomposition(&self) -> Option<Decomposition> {
+        let data = self.0 >> 8;
 
-        if m1 > 0 {
-            let m2 = ((self.0 >> 29) & 0x1FFFFF) as u32;
-
-            if m2 > 0 {
-                unsafe {
-                    Some(Decomposition::Double(
-                        char::from_u32_unchecked(m1),
-                        char::from_u32_unchecked(m2),
-                    ))
-                }
-            } else {
-                unsafe { Some(Decomposition::Single(char::from_u32_unchecked(m1))) }
-            }
+        if data > 0 {
+            Some(Decomposition(data))
         } else {
             None
         }
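The bit layout read off this hunk: bits 0..8 hold the combining class, bits 8..29 the first decomposition code point, and bits 29..50 the second (0 means absent); 21 bits per field follow from the `0x1FFFFF` mask and the shift by 21 in `Decomposition::next`, and every `char` fits in 21 bits. A round-trip sketch under that layout; the `pack` helper is hypothetical and only for illustration:

```rust
// Hypothetical packing helper matching the layout above.
fn pack(class: u8, first: Option<char>, second: Option<char>) -> u64 {
    let f = first.map_or(0, |c| c as u64);
    let s = second.map_or(0, |c| c as u64);
    (class as u64) | (f << 8) | (s << 29)
}

fn main() {
    // U+00C5 decomposes canonically into U+0041 followed by U+030A.
    let data = pack(0, Some('A'), Some('\u{30A}'));

    // Unpacking mirrors Decomposition::next: mask 21 bits, then shift by 21.
    let mut rest = data >> 8;
    let mut out = Vec::new();
    while rest & 0x1FFFFF != 0 {
        out.push(char::from_u32((rest & 0x1FFFFF) as u32).unwrap());
        rest >>= 21;
    }

    assert_eq!(out, ['A', '\u{30A}']);
}
```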
@@ -99,16 +105,14 @@ mod tests {
 
             prop_assert_eq!(b.combining_class(), combining_class, "data = {:064b}", data);
 
+            let c = b.decomposition().map(|i| i.collect::<Vec<_>>());
+
             match mapping_count {
-                0 => prop_assert_eq!(b.decomposition(), None, "data = {:064b}", data),
-                1 => prop_assert_eq!(b.decomposition(), Some(Decomposition::Single(decomposition_first)), "data = {:064b}", data),
-                2 => prop_assert_eq!(b.decomposition(), Some(Decomposition::Double(decomposition_first, decomposition_second)), "data = {:064b}", data),
+                0 => prop_assert_eq!(c, None, "data = {:064b}", data),
+                1 => prop_assert_eq!(c, Some(vec![decomposition_first]), "data = {:064b}", data),
+                2 => prop_assert_eq!(c, Some(vec![decomposition_first, decomposition_second]), "data = {:064b}", data),
                 _ => unreachable!(),
             }
-
-
-
-            // prop_assert_eq!(a, b, "data = {:064b}", data);
         }
     }
 }
@@ -1,16 +1,17 @@
 [package]
-name = "smol-uca"
+name = "u-sort"
 version = "0.1.0"
 edition = "2021"
 
 [dependencies]
-ufst = { path = "../ufst" }
-unf = { path = "../unf" }
+parse = { path = "../parse" }
+u-fst = { path = "../u-fst" }
+u-norm = { path = "../u-norm" }
 
 [build-dependencies]
 bytemuck = "1.9.1"
 parse = { path = "../parse" }
-ufst = { path = "../ufst" }
+u-fst = { path = "../u-fst" }
 
 [dev-dependencies]
 proptest = "1.0.0"
@@ -3,7 +3,7 @@ use std::fs;
 use std::path::Path;
 
 use parse::uca::allkeys;
-use ufst::raw::Builder;
+use u_fst::raw::Builder;
 
 fn main() {
     println!("cargo:rerun-if-changed=data/allkeys.txt");
@@ -52,7 +52,8 @@ impl Collator {
 
             // advance to last combining C
             while tail_index < code_points_len {
-                let combining_class = unf::table::lookup(code_points[tail_index]).combining_class();
+                let combining_class =
+                    u_norm::table::lookup(code_points[tail_index]).combining_class();
 
                 if debug {
                     eprintln!(
@@ -282,7 +283,7 @@ impl Collator {
     }
 
     pub fn sort_key<S: AsRef<str>>(&self, input: S) -> Vec<u16> {
-        let normalized = unf::nfd(input.as_ref()).collect::<Vec<_>>();
+        let normalized = u_norm::nfd(input.as_ref()).collect::<Vec<_>>();
         let collation_elements = self.collation_elements(&normalized);
 
         self.sort_key_from_collation_elements(&collation_elements)
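`sort_key` returns a `Vec<u16>`, and comparing those keys lexicographically gives the UCA ordering; the conformance test at the end of this commit relies on exactly that when it flags `sort_key < prev_sort_key` as an error. A minimal usage sketch; the diff does not show how a `Collator` is constructed, so the `&Collator` argument is simply taken as given here:

```rust
use u_sort::collator::Collator;

// Sort words by their UCA sort keys; `Vec<u16>` already orders lexicographically.
fn sorted_by_uca<'a>(collator: &Collator, mut words: Vec<&'a str>) -> Vec<&'a str> {
    words.sort_by_key(|w| collator.sort_key(w));
    words
}
```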
@@ -346,7 +347,7 @@ mod tests {
             "04D0 | 0020 004A 0024 | 0002 0002 0002 |",
             fmt(&sort_key),
             "nfd: {:?}",
-            unf::nfd(fixture)
+            u_norm::nfd(fixture)
                 .map(|ch| format!("{:04X}", ch as u32))
                 .collect::<Vec<_>>()
                 .join(" ")
@@ -364,7 +365,7 @@ mod tests {
             "FB00 9D00 0268 | 0020 0020 | 0002 0002 |",
             fmt(&sort_key),
             "nfd: {:?}",
-            unf::nfd(fixture)
+            u_norm::nfd(fixture)
                 .map(|ch| format!("{:04X}", ch as u32))
                 .collect::<Vec<_>>()
                 .join(" ")
52 crates/u-sort/src/main.rs Normal file
@@ -0,0 +1,52 @@
+use std::cmp;
+use std::collections::HashMap;
+use std::fs;
+
+use parse::uca::allkeys;
+
+fn main() {
+    let allkeys = {
+        let data = fs::read_to_string("data/allkeys.txt").unwrap();
+
+        allkeys::parse(&data)
+    };
+
+    let mut l1 = 0;
+    let mut l2 = 0;
+    let mut l3 = 0;
+    let mut l4 = 0;
+
+    let mut count = HashMap::new();
+
+    for entry in allkeys.entries {
+        count
+            .entry(entry.elements.len())
+            .and_modify(|x| *x += 1)
+            .or_insert(1);
+
+        for element in entry.elements {
+            l1 = cmp::max(l1, element.l1);
+            l2 = cmp::max(l2, element.l2);
+            l3 = cmp::max(l3, element.l3);
+            l4 = cmp::max(l4, element.l4);
+        }
+    }
+
+    /*
+    l1 = 16 bits
+    l2 = 9 bits
+    l3 = 5 bits
+    l4 = 0 bits
+    variable = 1 bit
+
+    total = 31 bits
+    */
+
+    println!("l1: {} - {} bit(s)", l1, u16::BITS - l1.leading_zeros());
+    println!("l2: {} - {} bit(s)", l2, u16::BITS - l2.leading_zeros());
+    println!("l3: {} - {} bit(s)", l3, u8::BITS - l3.leading_zeros());
+    println!("l4: {} - {} bit(s)", l4, u16::BITS - l4.leading_zeros());
+    println!("variable: 1 bit(s)");
+    println!();
+    println!("{:#?}", count);
+}
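The comment block in this new binary tallies the maximum weight widths found in allkeys.txt: 16 + 9 + 5 + 0 + 1 = 31 bits, so one collation element fits in a `u32` with a bit to spare. A packing sketch under a hypothetical field layout (the commit itself does not show any packing code; field positions and the example weights are illustrative):

```rust
// Hypothetical layout: l1 in bits 0..16, l2 in 16..25, l3 in 25..30,
// the "variable" flag in bit 30; l4 needs no bits per the analysis above.
fn pack(l1: u16, l2: u16, l3: u8, variable: bool) -> u32 {
    debug_assert!(l2 < (1 << 9) && l3 < (1 << 5));

    (l1 as u32) | ((l2 as u32) << 16) | ((l3 as u32) << 25) | ((variable as u32) << 30)
}

fn main() {
    let packed = pack(0x04D0, 0x20, 0x02, false);

    assert_eq!(packed & 0xFFFF, 0x04D0);      // l1
    assert_eq!((packed >> 16) & 0x1FF, 0x20); // l2
    assert_eq!((packed >> 25) & 0x1F, 0x02);  // l3
    assert_eq!((packed >> 30) & 1, 0);        // variable flag
}
```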
@@ -1,6 +1,6 @@
 use std::fmt::Display;
 
-use ufst::raw::{Fst, Output};
+use u_fst::raw::{Fst, Output};
 
 const TABLE: Fst<&'static [u8]> =
     Fst::new_unchecked(include_bytes!(concat!(env!("OUT_DIR"), "/table.fst")));
@@ -1,7 +1,7 @@
 use std::fs::File;
 use std::io::{BufRead, BufReader};
 
-use smol_uca::collator::Collator;
+use u_sort::collator::Collator;
 
 #[test]
 fn collation_test_non_ignorable() {
@@ -55,7 +55,7 @@ fn collation_test_non_ignorable() {
         let expected_sort_key = rest.rsplit(['[', ']']).nth(1).expect("sort key");
 
         let sort_key = collator.sort_key(&test_string);
-        let fmt_sort_key = smol_uca::fmt(&sort_key);
+        let fmt_sort_key = u_sort::fmt(&sort_key);
 
         if let Some(prev_sort_key) = prev_sort_key.take() {
             if sort_key < prev_sort_key {
@@ -76,7 +76,7 @@ fn collation_test_non_ignorable() {
                 eprintln!(
                     "Error at line {}: {:?} expected: [{}], got: [{}] ({})",
                     n + 1,
-                    unf::nfd(&test_string).collect::<String>(),
+                    u_norm::nfd(&test_string).collect::<String>(),
                     expected_sort_key,
                     fmt_sort_key,
                     line