Rename crates

This commit is contained in:
2022-05-24 20:58:27 +02:00
parent 8a8baffba8
commit 9f44196e6c
51 changed files with 2531 additions and 54 deletions

18
crates/u-sort/Cargo.toml Normal file
View File

@@ -0,0 +1,18 @@
[package]
name = "u-sort"
version = "0.1.0"
edition = "2021"
[dependencies]
parse = { path = "../parse" }
u-fst = { path = "../u-fst" }
u-norm = { path = "../u-norm" }
[build-dependencies]
bytemuck = "1.9.1"
parse = { path = "../parse" }
u-fst = { path = "../u-fst" }
[dev-dependencies]
proptest = "1.0.0"
similar-asserts = "1.2.0"

16
crates/u-sort/README.md Normal file
View File

@@ -0,0 +1,16 @@
# Smol UCA
An implementation of the [Unicode Collation Algorithm](https://unicode.org/reports/tr10/).
## Todo
- [x] Build the FST at build time
- [x] Switch to `u-fst`
- [ ] Add benchmarks
## See
- [ziglyph](https://github.com/jecolon/ziglyph/blob/main/src/collator/Collator.zig)
- [pyuca](https://github.com/jtauber/pyuca)
- [collate](https://github.com/tertsdiepraam/collate)
- [UCA Auxiliary Files](http://www.unicode.org/Public/UCA/6.0.0/CollationAuxiliary.html)

108
crates/u-sort/build.rs Normal file
View File

@@ -0,0 +1,108 @@
use std::env;
use std::fs;
use std::path::Path;
use parse::uca::allkeys;
use u_fst::raw::Builder;
/// Build script: turns `data/allkeys.txt` into the artifacts the crate embeds.
///
/// Three files are written to `OUT_DIR`:
/// * `implicit_weights.rs` - the `IMPLICIT_WEIGHTS` array of `(start, end, base)` tuples,
/// * `table.fst` - an FST keyed by the raw bytes of each entry's code points,
/// * `explicit_weights.rs` - the `EXPLICIT_WEIGHTS` overflow array holding the elements of
///   every entry that has more than one collation element.
///
/// Each FST value is a packed `u64`. Bit 0 clear means a single inline element
/// (`variable` at bit 1, `l1` from bit 2, `l2` from bit 18, `l3` from bit 34, `l4` from
/// bit 42). Bit 0 set means a reference into the overflow array (length from bit 1,
/// start index from bit 9).
///
/// NOTE(review): keys are produced with `bytemuck::cast_slice`, i.e. in the byte order of
/// the machine running this script.
fn main() {
    println!("cargo:rerun-if-changed=data/allkeys.txt");
    println!("cargo:rerun-if-changed=build.rs");

    let source = fs::read_to_string("data/allkeys.txt").unwrap();
    let all_keys = allkeys::parse(&source);

    let out_root = env::var_os("OUT_DIR").unwrap();
    let out_dir = Path::new(&out_root);

    // Generated source for the implicit weight ranges.
    let mut implicit_src = format!(
        "pub const IMPLICIT_WEIGHTS: [(u32, u32, u32); {}] = [\n",
        all_keys.implicit_weights.len(),
    );
    for range in all_keys.implicit_weights {
        implicit_src += &format!(" ({}, {}, {}),\n", range.start, range.end, range.base);
    }
    implicit_src += "];\n";
    fs::write(out_dir.join("implicit_weights.rs"), implicit_src).unwrap();

    // The FST builder receives keys in sorted byte order, so sort before inserting.
    let mut keyed: Vec<_> = all_keys
        .entries
        .into_iter()
        .map(|entry| {
            let key = bytemuck::cast_slice::<u32, u8>(&entry.chars).to_vec();
            (key, entry.elements)
        })
        .collect();
    keyed.sort_unstable_by(|left, right| left.0.cmp(&right.0));

    let mut builder = Builder::memory();
    let mut overflow = Vec::new();
    for (key, mut elements) in keyed {
        let packed = match elements.len() {
            0 => panic!("this shouldn't happen!"),
            1 => {
                // Single element: store it inline (tag bit 0 stays clear).
                let only = elements.pop().unwrap();
                let variable_bit: u64 = if only.variable { 1 } else { 0 };
                ((only.l4 as u64) << 42)
                    | ((only.l3 as u64) << 34)
                    | ((only.l2 as u64) << 18)
                    | ((only.l1 as u64) << 2)
                    | (variable_bit << 1)
            }
            len => {
                // Several elements: append them to the overflow table and store a reference.
                let idx = overflow.len();
                overflow.extend(elements);
                ((idx as u64) << 9) | ((len as u64) << 1) | 1
            }
        };
        builder.insert(key, packed).unwrap();
    }
    let table_bytes = builder.into_fst().into_inner();
    fs::write(out_dir.join("table.fst"), table_bytes).unwrap();

    // Generated source for the overflow table.
    let mut explicit_src = format!(
        "pub const EXPLICIT_WEIGHTS: [(u16, u16, u8, u16, bool); {}] = [\n",
        overflow.len(),
    );
    for element in overflow {
        explicit_src += &format!(
            " ({}, {}, {}, {}, {}),\n",
            element.l1, element.l2, element.l3, element.l4, element.variable,
        );
    }
    explicit_src += "];\n";
    fs::write(out_dir.join("explicit_weights.rs"), explicit_src).unwrap();
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

33925
crates/u-sort/data/allkeys.txt Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,374 @@
use std::iter;
use crate::table::{self, Element};
use crate::weights::IMPLICIT_WEIGHTS;
/// Stateless entry point for computing Unicode Collation Algorithm sort keys.
///
/// The type carries no data, so it is free to copy and to construct via
/// [`Default`].
#[derive(Debug, Clone, Copy, Default)]
pub struct Collator;
impl Collator {
    /// Maps an NFD-normalized code point sequence to its collation elements.
    ///
    /// At every position the longest match `S` in the collation table is taken. If `S`
    /// is followed by a run of two or more non-starters with strictly increasing
    /// combining classes, the last code point `C` of that run is tried as a
    /// discontiguous extension: when `S + C` is itself a complete table key, its
    /// elements are emitted, `C` is removed from the working buffer and scanning
    /// resumes right after `S`. Code points without any table entry receive implicit
    /// weights.
    fn collation_elements(&self, normalized: &[char]) -> Vec<Element> {
        let mut all_elements = Vec::new();
        // Working copy: a successful discontiguous match removes the consumed code point.
        let mut code_points = normalized.to_vec();
        let mut cp_index = 0;

        while cp_index < code_points.len() {
            // `idx` is the offset (relative to `cp_index`) of the last matched code point;
            // a miss yields no elements and `idx == 0`, i.e. a single unmatched code point.
            let (elements, idx) = table::lookup(&code_points[cp_index..]).unwrap_or_default();
            let tail_start = cp_index + idx + 1;

            if let Some(tail_index) = Self::discontiguous_candidate(&code_points, tail_start) {
                // The key must be built from the current match only, not from the start of
                // the whole input, otherwise contractions are missed past position 0.
                let mut new_key = Vec::with_capacity(tail_start - cp_index + 1);
                new_key.extend_from_slice(&code_points[cp_index..tail_start]);
                new_key.push(code_points[tail_index]);

                // A miss here is not an error: it happens whenever the leading code point
                // has no table entry at all.
                if let Some((new_elements, new_idx)) = table::lookup(&new_key) {
                    if new_idx + 1 == new_key.len() && !new_elements.is_empty() {
                        code_points.remove(tail_index);
                        all_elements.extend(new_elements);
                        cp_index = tail_start;
                        continue;
                    }
                }
            }

            if elements.is_empty() {
                // Derive the weight from the code point being processed.
                all_elements.extend(self.implicit_weight(code_points[cp_index] as u32));
            } else {
                all_elements.extend(elements);
            }
            cp_index = tail_start;
        }

        all_elements
    }

    /// Finds the code point to try as a discontiguous extension of the match that ends
    /// just before `tail_start`.
    ///
    /// Scans the leading run of code points at `tail_start` whose combining class is
    /// non-zero and strictly greater than that of the previous code point in the run.
    /// Returns the index of the run's last code point when the run holds at least two
    /// code points; a run of length one is contiguous with the match and was therefore
    /// already covered by the longest-match lookup.
    fn discontiguous_candidate(code_points: &[char], tail_start: usize) -> Option<usize> {
        let mut last_class = None;
        let mut run_len = 0;
        for &code_point in &code_points[tail_start..] {
            let combining_class = u_norm::table::lookup(code_point).combining_class();
            if combining_class == 0 {
                break;
            }
            if let Some(last_class) = last_class {
                if last_class >= combining_class {
                    break;
                }
            }
            last_class = Some(combining_class);
            run_len += 1;
        }
        if run_len >= 2 {
            Some(tail_start + run_len - 1)
        } else {
            None
        }
    }

    /// Derives the two implicit collation elements for a code point that has no entry
    /// in the collation table (UTS #10, "Implicit Weights").
    ///
    /// * Unified ideographs in `4E00..=9FFF` or `F900..=FAFF` use base `FB40`, other
    ///   unified ideographs use base `FB80`.
    /// * Code points inside a range from `IMPLICIT_WEIGHTS` use that range's base and an
    ///   offset from the range start; `18D00..=18D8F` is offset from `17000` instead.
    /// * Everything else uses base `FBC0`.
    ///
    /// The first element carries the `AAAA` weight at level 1 with fixed level 2/3
    /// weights; the second carries `BBBB` at level 1 only.
    fn implicit_weight(&self, cp: u32) -> Vec<Element> {
        let in_core_han_blocks =
            (0x4E00..=0x9FFF).contains(&cp) || (0xF900..=0xFAFF).contains(&cp);

        let (aaaa, bbbb) = if is_unified_ideograph(cp) {
            let base = if in_core_han_blocks { 0xFB40 } else { 0xFB80 };
            (base + (cp >> 15), (cp & 0x7FFF) | 0x8000)
        } else if let Some((start, _, base)) = IMPLICIT_WEIGHTS
            .iter()
            .find(|(start, end, _)| cp >= *start && cp <= *end)
        {
            let offset = if (0x18D00..=0x18D8F).contains(&cp) {
                cp - 0x17000
            } else {
                cp - *start
            };
            (*base, offset | 0x8000)
        } else {
            (0xFBC0 + (cp >> 15), (cp & 0x7FFF) | 0x8000)
        };

        vec![
            Element {
                l1: (aaaa & 0xFFFF) as u16,
                l2: 0x0020,
                l3: 0x0002,
                l4: 0x0000,
                variable: false,
            },
            Element {
                l1: (bbbb & 0xFFFF) as u16,
                l2: 0x0000,
                l3: 0x0000,
                l4: 0x0000,
                variable: false,
            },
        ]
    }

    /// Flattens collation elements into a sort key.
    ///
    /// The key lists all non-zero level 1 weights, then level 2, level 3 and level 4,
    /// with a single `0` separating consecutive levels.
    fn sort_key_from_collation_elements(&self, collation_elements: &[Element]) -> Vec<u16> {
        let non_zero = |weight: &u16| *weight > 0;
        let l1 = collation_elements.iter().map(|element| element.l1).filter(non_zero);
        let l2 = collation_elements.iter().map(|element| element.l2).filter(non_zero);
        let l3 = collation_elements
            .iter()
            .map(|element| element.l3 as u16)
            .filter(non_zero);
        let l4 = collation_elements.iter().map(|element| element.l4).filter(non_zero);

        l1.chain(iter::once(0))
            .chain(l2)
            .chain(iter::once(0))
            .chain(l3)
            .chain(iter::once(0))
            .chain(l4)
            .collect()
    }

    /// Computes the sort key of `input`.
    ///
    /// The input is normalized to NFD, mapped to collation elements and flattened into
    /// a key; comparing two keys orders the corresponding strings.
    pub fn sort_key<S: AsRef<str>>(&self, input: S) -> Vec<u16> {
        let normalized = u_norm::nfd(input.as_ref()).collect::<Vec<_>>();
        let collation_elements = self.collation_elements(&normalized);
        self.sort_key_from_collation_elements(&collation_elements)
    }
}
/// Returns `true` when `cp` lies in one of the hard-coded code point ranges this crate
/// treats as unified ideographs.
///
/// The ranges are a static list and have to be updated by hand when the underlying
/// Unicode data changes.
pub fn is_unified_ideograph(cp: u32) -> bool {
    matches!(
        cp,
        0x3400..=0x4DBF
            | 0x4E00..=0x9FFF
            | 0xFA0E..=0xFA0F
            | 0xFA11
            | 0xFA13..=0xFA14
            | 0xFA1F
            | 0xFA21
            | 0xFA23..=0xFA24
            | 0xFA27..=0xFA29
            | 0x20000..=0x2A6DF
            | 0x2A700..=0x2B738
            | 0x2B740..=0x2B81D
            | 0x2B820..=0x2CEA1
            | 0x2CEB0..=0x2EBE0
            | 0x30000..=0x3134A
    )
}
#[cfg(test)]
mod tests {
    use crate::fmt;

    use super::*;

    /// Space-separated hex dump of the NFD form of `input`, used in failure messages.
    fn nfd_hex(input: &str) -> String {
        u_norm::nfd(input)
            .map(|ch| format!("{:04X}", ch as u32))
            .collect::<Vec<_>>()
            .join(" ")
    }

    /// Two combining marks with no starter: both inputs must yield the same key.
    #[test]
    fn test_bug_001() {
        let collator = Collator::default();
        for fixture in ["\u{1abc}\u{334}", "\u{1ac1}\u{334}"] {
            let sort_key = collator.sort_key(fixture);
            similar_asserts::assert_eq!("| 004A 0033 | 0002 0002 |", fmt(&sort_key));
        }
    }

    /// A starter followed by two combining marks.
    #[test]
    fn test_bug_002() {
        let fixture = "\u{a8}\u{301}\u{334}";
        let sort_key = Collator::default().sort_key(fixture);
        similar_asserts::assert_eq!(
            "04D0 | 0020 004A 0024 | 0002 0002 0002 |",
            fmt(&sort_key),
            "nfd: {:?}",
            nfd_hex(fixture)
        );
    }

    /// An implicitly weighted code point followed by a regular table entry.
    #[test]
    fn test_bug_003() {
        let fixture = "\u{18d00}\u{21}";
        let sort_key = Collator::default().sort_key(fixture);
        similar_asserts::assert_eq!(
            "FB00 9D00 0268 | 0020 0020 | 0002 0002 |",
            fmt(&sort_key),
            "nfd: {:?}",
            nfd_hex(fixture)
        );
    }
}

16
crates/u-sort/src/lib.rs Normal file
View File

@@ -0,0 +1,16 @@
pub mod collator;
mod table;
mod weights;
/// Renders a sort key as text for tests and diagnostics.
///
/// Every non-zero weight is printed as at least four upper-case hex digits, every zero
/// (the level separator) as `|`, and the items are joined with single spaces.
pub fn fmt(sort_key: &[u16]) -> String {
    let mut rendered = String::with_capacity(sort_key.len() * 5);
    for (position, &weight) in sort_key.iter().enumerate() {
        if position > 0 {
            rendered.push(' ');
        }
        if weight == 0 {
            rendered.push('|');
        } else {
            rendered.push_str(&format!("{:04X}", weight));
        }
    }
    rendered
}

52
crates/u-sort/src/main.rs Normal file
View File

@@ -0,0 +1,52 @@
use std::cmp;
use std::collections::HashMap;
use std::fs;
use parse::uca::allkeys;
/// Prints statistics about `data/allkeys.txt`: the largest weight seen on every level
/// together with the number of bits needed to store it, and a histogram of how many
/// collation elements the entries have.
fn main() {
    let source = fs::read_to_string("data/allkeys.txt").unwrap();
    let parsed = allkeys::parse(&source);

    let (mut l1, mut l2, mut l3, mut l4) = (0, 0, 0, 0);
    // Maps "number of elements in an entry" to "number of such entries".
    let mut count = HashMap::new();
    for entry in parsed.entries {
        *count.entry(entry.elements.len()).or_insert(0) += 1;
        for element in entry.elements {
            l1 = cmp::max(l1, element.l1);
            l2 = cmp::max(l2, element.l2);
            l3 = cmp::max(l3, element.l3);
            l4 = cmp::max(l4, element.l4);
        }
    }

    /*
    l1 = 16 bits
    l2 = 9 bits
    l3 = 5 bits
    l4 = 0 bits
    variable = 1 bit
    total = 31 bits
    */
    let l1_bits = u16::BITS - l1.leading_zeros();
    let l2_bits = u16::BITS - l2.leading_zeros();
    let l3_bits = u8::BITS - l3.leading_zeros();
    let l4_bits = u16::BITS - l4.leading_zeros();
    println!("l1: {} - {} bit(s)", l1, l1_bits);
    println!("l2: {} - {} bit(s)", l2, l2_bits);
    println!("l3: {} - {} bit(s)", l3, l3_bits);
    println!("l4: {} - {} bit(s)", l4, l4_bits);
    println!("variable: 1 bit(s)");
    println!();
    println!("{:#?}", count);
}

161
crates/u-sort/src/table.rs Normal file
View File

@@ -0,0 +1,161 @@
use std::fmt::Display;
use u_fst::raw::{Fst, Output};
/// Collation table embedded at compile time from `table.fst` in `OUT_DIR`.
///
/// NOTE(review): `new_unchecked` presumably skips validation of the bytes; the data is
/// expected to be the FST serialized by this crate's build script - confirm against
/// `u_fst`.
const TABLE: Fst<&'static [u8]> =
Fst::new_unchecked(include_bytes!(concat!(env!("OUT_DIR"), "/table.fst")));
/// Finds the longest prefix of `value` that is a key of the collation table.
///
/// Returns the collation elements of that prefix and the index (within `value`) of the
/// last code point belonging to it, or `None` when no prefix is a key.
///
/// NOTE(review): each code point is fed to the FST as its four native-endian bytes,
/// which has to agree with the byte order the build script used when writing the keys.
pub fn lookup(value: &[char]) -> Option<(Vec<Element>, usize)> {
    let mut state = TABLE.root();
    let mut accumulated = Output::zero();
    let mut longest = None;

    'scan: for (position, &code_point) in value.iter().enumerate() {
        for byte in (code_point as u32).to_ne_bytes() {
            let transition = match state.find_input(byte) {
                Some(edge) => state.transition(edge),
                // No way forward: keep whatever match was recorded so far.
                None => break 'scan,
            };
            state = TABLE.node(transition.addr);
            accumulated = accumulated.cat(transition.out);
            if state.is_final() {
                longest = Some((accumulated.cat(state.final_output()).value(), position));
            }
        }
    }

    let (packed, matched) = longest?;
    let elements = match Value::from_u64(packed) {
        Value::Entry(element) => vec![element],
        Value::Index(first, count) => {
            // Multi-element entries live in the generated overflow table.
            let start = first as usize;
            let end = start + count as usize;
            crate::weights::EXPLICIT_WEIGHTS[start..end]
                .iter()
                .map(|&(l1, l2, l3, l4, variable)| Element {
                    l1,
                    l2,
                    l3,
                    l4,
                    variable,
                })
                .collect()
        }
    };
    Some((elements, matched))
}
/// A single collation element: one weight per comparison level plus the variable flag.
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
pub struct Element {
    /// Level 1 weight.
    pub l1: u16,
    /// Level 2 weight.
    pub l2: u16,
    /// Level 3 weight.
    pub l3: u8,
    /// Level 4 weight.
    pub l4: u16,
    /// Whether the element is marked as variable (printed as a leading `*`).
    pub variable: bool,
}
/// Formats an element as a marker (`*` for variable, `.` otherwise) followed by the four
/// weights, each as at least four upper-case hex digits, separated by dots.
impl Display for Element {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let marker = if self.variable { '*' } else { '.' };
        write!(
            f,
            "{}{:04X}.{:04X}.{:04X}.{:04X}",
            marker, self.l1, self.l2, self.l3, self.l4
        )
    }
}
/// Decoded form of the `u64` stored for every key of the collation table.
///
/// Bit 0 is the tag: clear for an inline element, set for an overflow reference.
#[derive(Clone, Copy, PartialEq, Debug)]
enum Value {
    /// The key maps to exactly one element, stored inline.
    Entry(Element),
    /// The key maps to several elements: start index into the overflow table and count.
    Index(u32, u8),
}

impl Value {
    /// Packs the value into a `u64`.
    ///
    /// * `Entry`: `variable` at bit 1, `l1` from bit 2, `l2` from bit 18, `l3` from
    ///   bit 34, `l4` from bit 42.
    /// * `Index`: tag bit set, count from bit 1, start index from bit 9.
    fn to_u64(self) -> u64 {
        match self {
            Self::Index(idx, len) => (u64::from(idx) << 9) | (u64::from(len) << 1) | 1,
            Self::Entry(Element {
                l1,
                l2,
                l3,
                l4,
                variable,
            }) => {
                (u64::from(l4) << 42)
                    | (u64::from(l3) << 34)
                    | (u64::from(l2) << 18)
                    | (u64::from(l1) << 2)
                    | (u64::from(variable) << 1)
            }
        }
    }

    /// Unpacks a `u64` produced with the layout described on [`Value::to_u64`].
    fn from_u64(data: u64) -> Self {
        if (data & 1) == 1 {
            let idx = ((data >> 9) & 0xFFFF_FFFF) as u32;
            let len = ((data >> 1) & 0xFF) as u8;
            return Self::Index(idx, len);
        }
        Self::Entry(Element {
            l1: ((data >> 2) & 0xFFFF) as u16,
            l2: ((data >> 18) & 0xFFFF) as u16,
            l3: ((data >> 34) & 0xFF) as u8,
            l4: ((data >> 42) & 0xFFFF) as u16,
            variable: ((data >> 1) & 1) == 1,
        })
    }
}
#[cfg(test)]
mod tests {
    use proptest::prelude::*;

    use super::*;

    /// Arbitrary overflow references.
    fn index_strategy() -> impl Strategy<Value = Value> {
        (any::<u32>(), any::<u8>()).prop_map(|(idx, len)| Value::Index(idx, len))
    }

    /// Arbitrary inline elements.
    fn entry_strategy() -> impl Strategy<Value = Value> {
        (
            any::<u16>(),
            any::<u16>(),
            any::<u8>(),
            any::<u16>(),
            any::<bool>(),
        )
            .prop_map(|(l1, l2, l3, l4, variable)| {
                Value::Entry(Element {
                    l1,
                    l2,
                    l3,
                    l4,
                    variable,
                })
            })
    }

    /// Either kind of value.
    fn value_strategy() -> impl Strategy<Value = Value> {
        prop_oneof![index_strategy(), entry_strategy()]
    }

    proptest! {
        /// Packing followed by unpacking must give back the original value.
        #[test]
        fn proptest_serialize_and_deserialize(a in value_strategy()) {
            let roundtripped = Value::from_u64(a.to_u64());
            prop_assert_eq!(a, roundtripped);
        }
    }
}

View File

@@ -0,0 +1,2 @@
// Brings in the generated `EXPLICIT_WEIGHTS` array: the elements of every table entry
// that maps to more than one collation element.
include!(concat!(env!("OUT_DIR"), "/explicit_weights.rs"));
// Brings in the generated `IMPLICIT_WEIGHTS` array of `(start, end, base)` ranges.
include!(concat!(env!("OUT_DIR"), "/implicit_weights.rs"));

View File

@@ -0,0 +1,91 @@
use std::fs::File;
use std::io::{BufRead, BufReader};
use u_sort::collator::Collator;
/// Runs the collator over `data/CollationTest_NON_IGNORABLE.txt`.
///
/// For every test line the computed sort key has to (a) not sort before the key of the
/// previous line and (b) match the key given in the trailing `[...]` of the line. Lines
/// containing surrogate code points are skipped because they cannot be represented as a
/// `String`. All mismatches are reported on stderr before the final assertions.
#[test]
fn collation_test_non_ignorable() {
    let reader = File::open("data/CollationTest_NON_IGNORABLE.txt")
        .map(BufReader::new)
        .expect("collation test data");
    let collator = Collator::default();

    let mut prev_sort_key: Option<Vec<u16>> = None;
    let mut order_errors = 0;
    let mut sort_key_errors = 0;

    for (n, raw_line) in reader.lines().enumerate() {
        let raw_line = raw_line.expect("line");
        let line = raw_line.trim_start();
        if line.is_empty() || line.starts_with('#') {
            continue;
        }

        let (chars, rest) = line
            .split_once(';')
            .expect("a semicolon separated test line");

        // Decode the hex code points; surrogates only flag the line as unusable.
        let mut test_string = String::new();
        let mut surrogates = false;
        for field in chars.trim().split(' ') {
            let value = u32::from_str_radix(field, 16).expect("a valid hex value");
            match char::from_u32(value) {
                Some(ch) => test_string.push(ch),
                None if (0xD800u32..=0xDFFF).contains(&value) => {
                    surrogates = true;
                    test_string.push(' ');
                }
                None => panic!("{}", line),
            }
        }
        if surrogates {
            continue;
        }

        let expected_sort_key = rest.rsplit(['[', ']']).nth(1).expect("sort key");
        let sort_key = collator.sort_key(&test_string);
        let fmt_sort_key = u_sort::fmt(&sort_key);

        let out_of_order = prev_sort_key
            .as_ref()
            .map_or(false, |previous| &sort_key < previous);
        if out_of_order {
            eprintln!(
                "Error at line {}: {:?} [{}]",
                n + 1,
                test_string,
                expected_sort_key
            );
            order_errors += 1;
        }
        prev_sort_key = Some(sort_key);

        if fmt_sort_key != expected_sort_key {
            eprintln!(
                "Error at line {}: {:?} expected: [{}], got: [{}] ({})",
                n + 1,
                u_norm::nfd(&test_string).collect::<String>(),
                expected_sort_key,
                fmt_sort_key,
                line
            );
            sort_key_errors += 1;
        }
    }

    assert_eq!(order_errors, 0);
    assert_eq!(sort_key_errors, 0);
}