Rename crates

This commit is contained in:
2022-05-24 20:58:27 +02:00
parent 8a8baffba8
commit 9f44196e6c
51 changed files with 2531 additions and 54 deletions

View File

@@ -1,5 +1,5 @@
[package] [package]
name = "ufst" name = "u-fst"
version = "0.4.7" #:version version = "0.4.7" #:version
authors = ["Andrew Gallant <jamslam@gmail.com>", "Anders Olsson <anders.e.olsson@gmail.com>"] authors = ["Andrew Gallant <jamslam@gmail.com>", "Anders Olsson <anders.e.olsson@gmail.com>"]
description = """ description = """

View File

@@ -1,16 +1,25 @@
[package] [package]
name = "unf" name = "u-norm"
version = "0.1.0" version = "0.1.0"
edition = "2021" edition = "2021"
[lib]
bench = false
[[bench]]
name = "bench"
harness = false
[dependencies] [dependencies]
fst = "0.4.7" fst = "0.4.7"
tinyvec = { version = "1.6.0", features = ["alloc"] } tinyvec = { version = "1.6.0", features = ["alloc"] }
ufst = { path = "../ufst" } u-fst = { path = "../u-fst" }
[build-dependencies] [build-dependencies]
ufst = { path = "../ufst" } u-fst = { path = "../u-fst" }
[dev-dependencies] [dev-dependencies]
criterion = "0.3.5"
proptest = "1.0.0" proptest = "1.0.0"
similar-asserts = "1.2.0" similar-asserts = "1.2.0"
unicode-normalization = "0.1.19"

5
crates/u-norm/README.md Normal file
View File

@@ -0,0 +1,5 @@
# u-norm
## Todo
- [x] Change Decomposition to be `struct Decomposition(u64)` that implements `Iterator`

View File

@@ -0,0 +1,29 @@
use std::fs;
use criterion::{criterion_group, criterion_main, Criterion};
use u_norm::nfd;
use unicode_normalization::UnicodeNormalization;
const ASCII: &str = "all types of normalized";
fn criterion_benchmark(c: &mut Criterion) {
let mut group = c.benchmark_group("ASCII");
group.bench_function("unf", |b| b.iter(|| nfd(ASCII).count()));
group.bench_function("unicode-normalization", |b| b.iter(|| ASCII.nfd().count()));
group.finish();
let long = fs::read_to_string("benches/long.txt").unwrap();
let mut group = c.benchmark_group("Long");
group.bench_function("unf", |b| b.iter(|| nfd(&long).count()));
group.bench_function("unicode-normalization", |b| b.iter(|| long.nfd().count()));
group.finish();
}
criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);

File diff suppressed because one or more lines are too long

View File

@@ -2,7 +2,7 @@ use std::env;
use std::fs; use std::fs;
use std::path::Path; use std::path::Path;
use ufst::raw::Fst; use u_fst::raw::Fst;
fn main() { fn main() {
let data = fs::read_to_string("data/UnicodeData.txt").unwrap(); let data = fs::read_to_string("data/UnicodeData.txt").unwrap();

View File

@@ -6,8 +6,6 @@ use tinyvec::TinyVec;
pub mod table; pub mod table;
use table::Decomposition;
pub fn nfd(s: &str) -> Decompositions<Chars<'_>> { pub fn nfd(s: &str) -> Decompositions<Chars<'_>> {
Decompositions { Decompositions {
iter: s.chars().fuse(), iter: s.chars().fuse(),
@@ -28,6 +26,7 @@ impl Buffer {
} }
} }
#[inline(always)]
fn push_back(&mut self, ch: char) { fn push_back(&mut self, ch: char) {
let class = table::lookup(ch).combining_class(); let class = table::lookup(ch).combining_class();
@@ -41,10 +40,12 @@ impl Buffer {
} }
} }
#[inline(always)]
fn sort_pending(&mut self) { fn sort_pending(&mut self) {
self.buffer[self.ready.end..].sort_by_key(|k| k.0); self.buffer[self.ready.end..].sort_by_key(|k| k.0);
} }
#[inline(always)]
fn reset(&mut self) { fn reset(&mut self) {
let pending = self.buffer.len() - self.ready.end; let pending = self.buffer.len() - self.ready.end;
@@ -56,6 +57,7 @@ impl Buffer {
self.ready = 0..0; self.ready = 0..0;
} }
#[inline(always)]
fn increment_next_ready(&mut self) { fn increment_next_ready(&mut self) {
let next = self.ready.start + 1; let next = self.ready.start + 1;
@@ -134,14 +136,8 @@ fn decompose(c: char, buffer: &mut Buffer) {
} }
if let Some(decomposed) = table::lookup(c).decomposition() { if let Some(decomposed) = table::lookup(c).decomposition() {
match decomposed { for d in decomposed {
Decomposition::Single(f) => { decompose(d, buffer);
decompose(f, buffer);
}
Decomposition::Double(f, s) => {
decompose(f, buffer);
decompose(s, buffer);
}
} }
return; return;
} }

14
crates/u-norm/src/main.rs Normal file
View File

@@ -0,0 +1,14 @@
use u_norm::table;

/// Debug utility: dump the table entry (combining class plus full
/// decomposition, if any) for every ASCII code point, one line each.
fn main() {
    ('\x00'..='\x7f').for_each(|ch| {
        let entry = table::lookup(ch);
        let class = entry.combining_class();
        let decomp = entry.decomposition().map(|it| it.collect::<Vec<_>>());
        println!("{:?} class: {}, decomp: {:?}", ch, class, decomp);
    });
}

View File

@@ -1,8 +1,9 @@
use ufst::raw::Fst; use u_fst::raw::Fst;
const TABLE: Fst<&'static [u8]> = const TABLE: Fst<&'static [u8]> =
Fst::new_unchecked(include_bytes!(concat!(env!("OUT_DIR"), "/table.fst"))); Fst::new_unchecked(include_bytes!(concat!(env!("OUT_DIR"), "/table.fst")));
#[inline(always)]
pub fn lookup(ch: char) -> Entry { pub fn lookup(ch: char) -> Entry {
Entry::new( Entry::new(
TABLE TABLE
@@ -13,9 +14,23 @@ pub fn lookup(ch: char) -> Entry {
} }
#[derive(Clone, Copy, PartialEq, Debug)] #[derive(Clone, Copy, PartialEq, Debug)]
pub enum Decomposition { pub struct Decomposition(u64);
Single(char),
Double(char, char), impl Iterator for Decomposition {
type Item = char;
#[inline(always)]
fn next(&mut self) -> Option<Self::Item> {
let d = (self.0 & 0x1FFFFF) as u32;
if d > 0 {
self.0 >>= 21;
Some(unsafe { char::from_u32_unchecked(d) })
} else {
None
}
}
} }
#[derive(Clone, Copy, PartialEq, Debug)] #[derive(Clone, Copy, PartialEq, Debug)]
@@ -26,26 +41,17 @@ impl Entry {
Self(data) Self(data)
} }
#[inline(always)]
pub fn combining_class(&self) -> u8 { pub fn combining_class(&self) -> u8 {
(self.0 & 0xFF) as u8 (self.0 & 0xFF) as u8
} }
pub(crate) fn decomposition(&self) -> Option<Decomposition> { #[inline(always)]
let m1 = ((self.0 >> 8) & 0x1FFFFF) as u32; pub fn decomposition(&self) -> Option<Decomposition> {
let data = self.0 >> 8;
if m1 > 0 { if data > 0 {
let m2 = ((self.0 >> 29) & 0x1FFFFF) as u32; Some(Decomposition(data))
if m2 > 0 {
unsafe {
Some(Decomposition::Double(
char::from_u32_unchecked(m1),
char::from_u32_unchecked(m2),
))
}
} else {
unsafe { Some(Decomposition::Single(char::from_u32_unchecked(m1))) }
}
} else { } else {
None None
} }
@@ -99,16 +105,14 @@ mod tests {
prop_assert_eq!(b.combining_class(), combining_class, "data = {:064b}", data); prop_assert_eq!(b.combining_class(), combining_class, "data = {:064b}", data);
let c = b.decomposition().map(|i| i.collect::<Vec<_>>());
match mapping_count { match mapping_count {
0 => prop_assert_eq!(b.decomposition(), None, "data = {:064b}", data), 0 => prop_assert_eq!(c, None, "data = {:064b}", data),
1 => prop_assert_eq!(b.decomposition(), Some(Decomposition::Single(decomposition_first)), "data = {:064b}", data), 1 => prop_assert_eq!(c, Some(vec![decomposition_first]), "data = {:064b}", data),
2 => prop_assert_eq!(b.decomposition(), Some(Decomposition::Double(decomposition_first, decomposition_second)), "data = {:064b}", data), 2 => prop_assert_eq!(c, Some(vec![decomposition_first, decomposition_second]), "data = {:064b}", data),
_ => unreachable!(), _ => unreachable!(),
} }
// prop_assert_eq!(a, b, "data = {:064b}", data);
} }
} }
} }

View File

@@ -1,16 +1,17 @@
[package] [package]
name = "smol-uca" name = "u-sort"
version = "0.1.0" version = "0.1.0"
edition = "2021" edition = "2021"
[dependencies] [dependencies]
ufst = { path = "../ufst" } parse = { path = "../parse" }
unf = { path = "../unf" } u-fst = { path = "../u-fst" }
u-norm = { path = "../u-norm" }
[build-dependencies] [build-dependencies]
bytemuck = "1.9.1" bytemuck = "1.9.1"
parse = { path = "../parse" } parse = { path = "../parse" }
ufst = { path = "../ufst" } u-fst = { path = "../u-fst" }
[dev-dependencies] [dev-dependencies]
proptest = "1.0.0" proptest = "1.0.0"

View File

@@ -3,7 +3,7 @@ use std::fs;
use std::path::Path; use std::path::Path;
use parse::uca::allkeys; use parse::uca::allkeys;
use ufst::raw::Builder; use u_fst::raw::Builder;
fn main() { fn main() {
println!("cargo:rerun-if-changed=data/allkeys.txt"); println!("cargo:rerun-if-changed=data/allkeys.txt");

View File

@@ -52,7 +52,8 @@ impl Collator {
// advance to last combining C // advance to last combining C
while tail_index < code_points_len { while tail_index < code_points_len {
let combining_class = unf::table::lookup(code_points[tail_index]).combining_class(); let combining_class =
u_norm::table::lookup(code_points[tail_index]).combining_class();
if debug { if debug {
eprintln!( eprintln!(
@@ -282,7 +283,7 @@ impl Collator {
} }
pub fn sort_key<S: AsRef<str>>(&self, input: S) -> Vec<u16> { pub fn sort_key<S: AsRef<str>>(&self, input: S) -> Vec<u16> {
let normalized = unf::nfd(input.as_ref()).collect::<Vec<_>>(); let normalized = u_norm::nfd(input.as_ref()).collect::<Vec<_>>();
let collation_elements = self.collation_elements(&normalized); let collation_elements = self.collation_elements(&normalized);
self.sort_key_from_collation_elements(&collation_elements) self.sort_key_from_collation_elements(&collation_elements)
@@ -346,7 +347,7 @@ mod tests {
"04D0 | 0020 004A 0024 | 0002 0002 0002 |", "04D0 | 0020 004A 0024 | 0002 0002 0002 |",
fmt(&sort_key), fmt(&sort_key),
"nfd: {:?}", "nfd: {:?}",
unf::nfd(fixture) u_norm::nfd(fixture)
.map(|ch| format!("{:04X}", ch as u32)) .map(|ch| format!("{:04X}", ch as u32))
.collect::<Vec<_>>() .collect::<Vec<_>>()
.join(" ") .join(" ")
@@ -364,7 +365,7 @@ mod tests {
"FB00 9D00 0268 | 0020 0020 | 0002 0002 |", "FB00 9D00 0268 | 0020 0020 | 0002 0002 |",
fmt(&sort_key), fmt(&sort_key),
"nfd: {:?}", "nfd: {:?}",
unf::nfd(fixture) u_norm::nfd(fixture)
.map(|ch| format!("{:04X}", ch as u32)) .map(|ch| format!("{:04X}", ch as u32))
.collect::<Vec<_>>() .collect::<Vec<_>>()
.join(" ") .join(" ")

52
crates/u-sort/src/main.rs Normal file
View File

@@ -0,0 +1,52 @@
use std::cmp;
use std::collections::HashMap;
use std::fs;

use parse::uca::allkeys;

/// Inspect allkeys.txt: print the maximum value (and the bit width needed
/// to store it) for each collation weight level, followed by a histogram
/// of how many collation elements each entry carries.
fn main() {
    let data = fs::read_to_string("data/allkeys.txt").unwrap();
    let allkeys = allkeys::parse(&data);

    let (mut l1, mut l2, mut l3, mut l4) = (0, 0, 0, 0);
    // elements-per-entry -> number of entries with that many elements
    let mut count = HashMap::new();

    for entry in allkeys.entries {
        *count.entry(entry.elements.len()).or_insert(0) += 1;
        for element in entry.elements {
            l1 = cmp::max(l1, element.l1);
            l2 = cmp::max(l2, element.l2);
            l3 = cmp::max(l3, element.l3);
            l4 = cmp::max(l4, element.l4);
        }
    }

    /*
    Observed on a full data set:
        l1 = 16 bits
        l2 = 9 bits
        l3 = 5 bits
        l4 = 0 bits
        variable = 1 bit
        total = 31 bits
    */
    // NOTE(review): the BITS constants assume l1/l2/l4 are u16 and l3 is u8
    // (types come from the external `parse` crate) — confirm against parse::uca.
    println!("l1: {} - {} bit(s)", l1, u16::BITS - l1.leading_zeros());
    println!("l2: {} - {} bit(s)", l2, u16::BITS - l2.leading_zeros());
    println!("l3: {} - {} bit(s)", l3, u8::BITS - l3.leading_zeros());
    println!("l4: {} - {} bit(s)", l4, u16::BITS - l4.leading_zeros());
    println!("variable: 1 bit(s)");
    println!();
    println!("{:#?}", count);
}

View File

@@ -1,6 +1,6 @@
use std::fmt::Display; use std::fmt::Display;
use ufst::raw::{Fst, Output}; use u_fst::raw::{Fst, Output};
const TABLE: Fst<&'static [u8]> = const TABLE: Fst<&'static [u8]> =
Fst::new_unchecked(include_bytes!(concat!(env!("OUT_DIR"), "/table.fst"))); Fst::new_unchecked(include_bytes!(concat!(env!("OUT_DIR"), "/table.fst")));

View File

@@ -1,7 +1,7 @@
use std::fs::File; use std::fs::File;
use std::io::{BufRead, BufReader}; use std::io::{BufRead, BufReader};
use smol_uca::collator::Collator; use u_sort::collator::Collator;
#[test] #[test]
fn collation_test_non_ignorable() { fn collation_test_non_ignorable() {
@@ -55,7 +55,7 @@ fn collation_test_non_ignorable() {
let expected_sort_key = rest.rsplit(['[', ']']).nth(1).expect("sort key"); let expected_sort_key = rest.rsplit(['[', ']']).nth(1).expect("sort key");
let sort_key = collator.sort_key(&test_string); let sort_key = collator.sort_key(&test_string);
let fmt_sort_key = smol_uca::fmt(&sort_key); let fmt_sort_key = u_sort::fmt(&sort_key);
if let Some(prev_sort_key) = prev_sort_key.take() { if let Some(prev_sort_key) = prev_sort_key.take() {
if sort_key < prev_sort_key { if sort_key < prev_sort_key {
@@ -76,7 +76,7 @@ fn collation_test_non_ignorable() {
eprintln!( eprintln!(
"Error at line {}: {:?} expected: [{}], got: [{}] ({})", "Error at line {}: {:?} expected: [{}], got: [{}] ({})",
n + 1, n + 1,
unf::nfd(&test_string).collect::<String>(), u_norm::nfd(&test_string).collect::<String>(),
expected_sort_key, expected_sort_key,
fmt_sort_key, fmt_sort_key,
line line