Rename crates
@@ -1,5 +1,5 @@
 [package]
-name = "ufst"
+name = "u-fst"
 version = "0.4.7" #:version
 authors = ["Andrew Gallant <jamslam@gmail.com>", "Anders Olsson <anders.e.olsson@gmail.com>"]
 description = """
@@ -1,16 +1,25 @@
 [package]
-name = "unf"
+name = "u-norm"
 version = "0.1.0"
 edition = "2021"
 
+[lib]
+bench = false
+
+[[bench]]
+name = "bench"
+harness = false
+
 [dependencies]
 fst = "0.4.7"
 tinyvec = { version = "1.6.0", features = ["alloc"] }
-ufst = { path = "../ufst" }
+u-fst = { path = "../u-fst" }
 
 [build-dependencies]
-ufst = { path = "../ufst" }
+u-fst = { path = "../u-fst" }
 
 [dev-dependencies]
+criterion = "0.3.5"
 proptest = "1.0.0"
 similar-asserts = "1.2.0"
+unicode-normalization = "0.1.19"
crates/u-norm/README.md (new file, 5 lines)
@@ -0,0 +1,5 @@
+# UNF
+
+## Todo
+
+- [ ] Change Decomposition to be `struct Decomposition(u64)` that implements `Iterator`
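The `struct Decomposition(u64)` iterator described in this Todo appears further down in this diff (in `table.rs`). As a quick illustration of why an `Iterator`-based decomposition is convenient for callers, here is a minimal sketch using the crate's public `nfd` entry point (the asserted value is simply the standard NFD expansion of U+00C5):

    use u_norm::nfd;

    fn main() {
        // nfd() yields decomposed characters lazily, so callers can stream,
        // count, or collect them without an intermediate buffer.
        let decomposed: String = nfd("Å").collect();
        assert_eq!(decomposed, "A\u{030A}"); // U+0041 followed by COMBINING RING ABOVE
    }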
crates/u-norm/benches/bench.rs (new file, 29 lines)
@@ -0,0 +1,29 @@
+use std::fs;
+
+use criterion::{criterion_group, criterion_main, Criterion};
+
+use u_norm::nfd;
+use unicode_normalization::UnicodeNormalization;
+
+const ASCII: &str = "all types of normalized";
+
+fn criterion_benchmark(c: &mut Criterion) {
+    let mut group = c.benchmark_group("ASCII");
+
+    group.bench_function("unf", |b| b.iter(|| nfd(ASCII).count()));
+    group.bench_function("unicode-normalization", |b| b.iter(|| ASCII.nfd().count()));
+
+    group.finish();
+
+    let long = fs::read_to_string("benches/long.txt").unwrap();
+
+    let mut group = c.benchmark_group("Long");
+
+    group.bench_function("unf", |b| b.iter(|| nfd(&long).count()));
+    group.bench_function("unicode-normalization", |b| b.iter(|| long.nfd().count()));
+
+    group.finish();
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
crates/u-norm/benches/long.txt (new file, 2366 lines)
File diff suppressed because one or more lines are too long
@@ -2,7 +2,7 @@ use std::env;
 use std::fs;
 use std::path::Path;
 
-use ufst::raw::Fst;
+use u_fst::raw::Fst;
 
 fn main() {
     let data = fs::read_to_string("data/UnicodeData.txt").unwrap();
@@ -6,8 +6,6 @@ use tinyvec::TinyVec;
 
 pub mod table;
 
-use table::Decomposition;
-
 pub fn nfd(s: &str) -> Decompositions<Chars<'_>> {
     Decompositions {
         iter: s.chars().fuse(),
@@ -28,6 +26,7 @@ impl Buffer {
         }
     }
 
+    #[inline(always)]
     fn push_back(&mut self, ch: char) {
         let class = table::lookup(ch).combining_class();
 
@@ -41,10 +40,12 @@ impl Buffer {
         }
     }
 
+    #[inline(always)]
     fn sort_pending(&mut self) {
         self.buffer[self.ready.end..].sort_by_key(|k| k.0);
     }
 
+    #[inline(always)]
     fn reset(&mut self) {
         let pending = self.buffer.len() - self.ready.end;
 
@@ -56,6 +57,7 @@ impl Buffer {
         self.ready = 0..0;
     }
 
+    #[inline(always)]
     fn increment_next_ready(&mut self) {
         let next = self.ready.start + 1;
 
@@ -134,14 +136,8 @@ fn decompose(c: char, buffer: &mut Buffer) {
     }
 
     if let Some(decomposed) = table::lookup(c).decomposition() {
-        match decomposed {
-            Decomposition::Single(f) => {
-                decompose(f, buffer);
-            }
-            Decomposition::Double(f, s) => {
-                decompose(f, buffer);
-                decompose(s, buffer);
-            }
+        for d in decomposed {
+            decompose(d, buffer);
         }
         return;
     }
crates/u-norm/src/main.rs (new file, 14 lines)
@@ -0,0 +1,14 @@
+use u_norm::table;
+
+fn main() {
+    for c in '\x00'..='\x7f' {
+        let d = table::lookup(c);
+
+        println!(
+            "{:?} class: {}, decomp: {:?}",
+            c,
+            d.combining_class(),
+            d.decomposition().map(|d| d.collect::<Vec<_>>())
+        );
+    }
+}
@@ -1,8 +1,9 @@
-use ufst::raw::Fst;
+use u_fst::raw::Fst;
 
 const TABLE: Fst<&'static [u8]> =
     Fst::new_unchecked(include_bytes!(concat!(env!("OUT_DIR"), "/table.fst")));
 
+#[inline(always)]
 pub fn lookup(ch: char) -> Entry {
     Entry::new(
         TABLE
@@ -13,9 +14,23 @@ pub fn lookup(ch: char) -> Entry {
 }
 
 #[derive(Clone, Copy, PartialEq, Debug)]
-pub enum Decomposition {
-    Single(char),
-    Double(char, char),
+pub struct Decomposition(u64);
+
+impl Iterator for Decomposition {
+    type Item = char;
+
+    #[inline(always)]
+    fn next(&mut self) -> Option<Self::Item> {
+        let d = (self.0 & 0x1FFFFF) as u32;
+
+        if d > 0 {
+            self.0 >>= 21;
+
+            Some(unsafe { char::from_u32_unchecked(d) })
+        } else {
+            None
+        }
+    }
 }
 
 #[derive(Clone, Copy, PartialEq, Debug)]
@@ -26,26 +41,17 @@ impl Entry {
         Self(data)
     }
 
+    #[inline(always)]
     pub fn combining_class(&self) -> u8 {
         (self.0 & 0xFF) as u8
     }
 
-    pub(crate) fn decomposition(&self) -> Option<Decomposition> {
-        let m1 = ((self.0 >> 8) & 0x1FFFFF) as u32;
+    #[inline(always)]
+    pub fn decomposition(&self) -> Option<Decomposition> {
+        let data = self.0 >> 8;
 
-        if m1 > 0 {
-            let m2 = ((self.0 >> 29) & 0x1FFFFF) as u32;
-
-            if m2 > 0 {
-                unsafe {
-                    Some(Decomposition::Double(
-                        char::from_u32_unchecked(m1),
-                        char::from_u32_unchecked(m2),
-                    ))
-                }
-            } else {
-                unsafe { Some(Decomposition::Single(char::from_u32_unchecked(m1))) }
-            }
+        if data > 0 {
+            Some(Decomposition(data))
         } else {
             None
         }
@@ -99,16 +105,14 @@ mod tests {
 
             prop_assert_eq!(b.combining_class(), combining_class, "data = {:064b}", data);
 
+            let c = b.decomposition().map(|i| i.collect::<Vec<_>>());
+
             match mapping_count {
-                0 => prop_assert_eq!(b.decomposition(), None, "data = {:064b}", data),
-                1 => prop_assert_eq!(b.decomposition(), Some(Decomposition::Single(decomposition_first)), "data = {:064b}", data),
-                2 => prop_assert_eq!(b.decomposition(), Some(Decomposition::Double(decomposition_first, decomposition_second)), "data = {:064b}", data),
+                0 => prop_assert_eq!(c, None, "data = {:064b}", data),
+                1 => prop_assert_eq!(c, Some(vec![decomposition_first]), "data = {:064b}", data),
+                2 => prop_assert_eq!(c, Some(vec![decomposition_first, decomposition_second]), "data = {:064b}", data),
                 _ => unreachable!(),
             }
-
-
-
-            // prop_assert_eq!(a, b, "data = {:064b}", data);
         }
     }
 }
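For reference, a small sketch of the bit layout that the new `Decomposition(u64)` iterator and `Entry::decomposition` above rely on, as implied by their masks and shifts (combining class in the low 8 bits, then up to two 21-bit code points). The `pack_entry` helper is hypothetical and exists only to illustrate the encoding; it is not code from the crate:

    /// Hypothetical packer mirroring the layout read back by `Entry` above:
    /// bits 0..8   canonical combining class
    /// bits 8..29  first decomposed code point (21 bits, 0 = none)
    /// bits 29..50 second decomposed code point (21 bits, 0 = none)
    fn pack_entry(class: u8, first: Option<char>, second: Option<char>) -> u64 {
        let first = first.map_or(0, |c| c as u64);
        let second = second.map_or(0, |c| c as u64);
        (second << 29) | (first << 8) | class as u64
    }

    fn main() {
        // U+00C5 decomposes canonically to U+0041 U+030A.
        let data = pack_entry(0, Some('\u{0041}'), Some('\u{030A}'));

        // Entry::decomposition() discards the class byte: Decomposition(data >> 8).
        let mut decomposition = data >> 8;

        // Decomposition::next() then peels off one 21-bit code point per call.
        let mut out = Vec::new();
        while decomposition & 0x1FFFFF != 0 {
            out.push(char::from_u32((decomposition & 0x1FFFFF) as u32).unwrap());
            decomposition >>= 21;
        }
        assert_eq!(out, ['\u{0041}', '\u{030A}']);
    }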
@@ -1,16 +1,17 @@
 [package]
-name = "smol-uca"
+name = "u-sort"
 version = "0.1.0"
 edition = "2021"
 
 [dependencies]
-ufst = { path = "../ufst" }
-unf = { path = "../unf" }
+parse = { path = "../parse" }
+u-fst = { path = "../u-fst" }
+u-norm = { path = "../u-norm" }
 
 [build-dependencies]
 bytemuck = "1.9.1"
 parse = { path = "../parse" }
-ufst = { path = "../ufst" }
+u-fst = { path = "../u-fst" }
 
 [dev-dependencies]
 proptest = "1.0.0"
@@ -3,7 +3,7 @@ use std::fs;
 use std::path::Path;
 
 use parse::uca::allkeys;
-use ufst::raw::Builder;
+use u_fst::raw::Builder;
 
 fn main() {
     println!("cargo:rerun-if-changed=data/allkeys.txt");
@@ -52,7 +52,8 @@ impl Collator {
 
         // advance to last combining C
         while tail_index < code_points_len {
-            let combining_class = unf::table::lookup(code_points[tail_index]).combining_class();
+            let combining_class =
+                u_norm::table::lookup(code_points[tail_index]).combining_class();
 
             if debug {
                 eprintln!(
@@ -282,7 +283,7 @@ impl Collator {
     }
 
     pub fn sort_key<S: AsRef<str>>(&self, input: S) -> Vec<u16> {
-        let normalized = unf::nfd(input.as_ref()).collect::<Vec<_>>();
+        let normalized = u_norm::nfd(input.as_ref()).collect::<Vec<_>>();
         let collation_elements = self.collation_elements(&normalized);
 
         self.sort_key_from_collation_elements(&collation_elements)
@@ -346,7 +347,7 @@ mod tests {
            "04D0 | 0020 004A 0024 | 0002 0002 0002 |",
            fmt(&sort_key),
            "nfd: {:?}",
-           unf::nfd(fixture)
+           u_norm::nfd(fixture)
                .map(|ch| format!("{:04X}", ch as u32))
                .collect::<Vec<_>>()
                .join(" ")
@@ -364,7 +365,7 @@ mod tests {
            "FB00 9D00 0268 | 0020 0020 | 0002 0002 |",
            fmt(&sort_key),
            "nfd: {:?}",
-           unf::nfd(fixture)
+           u_norm::nfd(fixture)
                .map(|ch| format!("{:04X}", ch as u32))
                .collect::<Vec<_>>()
                .join(" ")
crates/u-sort/src/main.rs (new file, 52 lines)
@@ -0,0 +1,52 @@
+use std::cmp;
+use std::collections::HashMap;
+use std::fs;
+
+use parse::uca::allkeys;
+
+fn main() {
+    let allkeys = {
+        let data = fs::read_to_string("data/allkeys.txt").unwrap();
+
+        allkeys::parse(&data)
+    };
+
+    let mut l1 = 0;
+    let mut l2 = 0;
+    let mut l3 = 0;
+    let mut l4 = 0;
+
+    let mut count = HashMap::new();
+
+    for entry in allkeys.entries {
+        count
+            .entry(entry.elements.len())
+            .and_modify(|x| *x += 1)
+            .or_insert(1);
+
+        for element in entry.elements {
+            l1 = cmp::max(l1, element.l1);
+            l2 = cmp::max(l2, element.l2);
+            l3 = cmp::max(l3, element.l3);
+            l4 = cmp::max(l4, element.l4);
+        }
+    }
+
+    /*
+    l1 = 16 bits
+    l2 = 9 bits
+    l3 = 5 bits
+    l4 = 0 bits
+    variable = 1 bit
+
+    total = 31 bits
+    */
+
+    println!("l1: {} - {} bit(s)", l1, u16::BITS - l1.leading_zeros());
+    println!("l2: {} - {} bit(s)", l2, u16::BITS - l2.leading_zeros());
+    println!("l3: {} - {} bit(s)", l3, u8::BITS - l3.leading_zeros());
+    println!("l4: {} - {} bit(s)", l4, u16::BITS - l4.leading_zeros());
+    println!("variable: 1 bit(s)");
+    println!();
+    println!("{:#?}", count);
+}
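The comment block in the tool above tallies the maximum weight widths seen in allkeys.txt (16 + 9 + 5 + 0 + 1 = 31 bits), which is why a single collation element can fit in one 32-bit word. Below is a minimal sketch of such a packing; the field layout and the `pack` helper are hypothetical, chosen only to match those widths, and are not code from the crate:

    /// Hypothetical 32-bit packing using the widths reported by the tool above:
    /// bits 0..16  l1 (primary), bits 16..25 l2 (secondary, 9 bits),
    /// bits 25..30 l3 (tertiary, 5 bits), bit 30 variable flag; l4 needs 0 bits.
    fn pack(l1: u16, l2: u16, l3: u8, variable: bool) -> u32 {
        debug_assert!(l2 < (1 << 9));
        debug_assert!(l3 < (1 << 5));
        (l1 as u32) | ((l2 as u32) << 16) | ((l3 as u32) << 25) | ((variable as u32) << 30)
    }

    fn main() {
        // Example weights in the style of allkeys.txt entries (l1.l2.l3).
        let packed = pack(0x0209, 0x0020, 0x02, false);
        assert_eq!(packed & 0xFFFF, 0x0209);        // l1
        assert_eq!((packed >> 16) & 0x1FF, 0x0020); // l2
        assert_eq!((packed >> 25) & 0x1F, 0x02);    // l3
        assert_eq!((packed >> 30) & 1, 0);          // variable flag
    }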
@@ -1,6 +1,6 @@
 use std::fmt::Display;
 
-use ufst::raw::{Fst, Output};
+use u_fst::raw::{Fst, Output};
 
 const TABLE: Fst<&'static [u8]> =
     Fst::new_unchecked(include_bytes!(concat!(env!("OUT_DIR"), "/table.fst")));
@@ -1,7 +1,7 @@
 use std::fs::File;
 use std::io::{BufRead, BufReader};
 
-use smol_uca::collator::Collator;
+use u_sort::collator::Collator;
 
 #[test]
 fn collation_test_non_ignorable() {
@@ -55,7 +55,7 @@ fn collation_test_non_ignorable() {
         let expected_sort_key = rest.rsplit(['[', ']']).nth(1).expect("sort key");
 
         let sort_key = collator.sort_key(&test_string);
-        let fmt_sort_key = smol_uca::fmt(&sort_key);
+        let fmt_sort_key = u_sort::fmt(&sort_key);
 
         if let Some(prev_sort_key) = prev_sort_key.take() {
             if sort_key < prev_sort_key {
@@ -76,7 +76,7 @@ fn collation_test_non_ignorable() {
                eprintln!(
                    "Error at line {}: {:?} expected: [{}], got: [{}] ({})",
                    n + 1,
-                   unf::nfd(&test_string).collect::<String>(),
+                   u_norm::nfd(&test_string).collect::<String>(),
                    expected_sort_key,
                    fmt_sort_key,
                    line