Initial commit

This commit is contained in:
2022-05-19 23:26:00 +02:00
commit 8a8baffba8
53 changed files with 761345 additions and 0 deletions

View File

@@ -0,0 +1,17 @@
[package]
name = "smol-uca"
version = "0.1.0"
edition = "2021"
[dependencies]
ufst = { path = "../ufst" }
unf = { path = "../unf" }
[build-dependencies]
bytemuck = "1.9.1"
parse = { path = "../parse" }
ufst = { path = "../ufst" }
[dev-dependencies]
proptest = "1.0.0"
similar-asserts = "1.2.0"

16
crates/smol-uca/README.md Normal file
View File

@@ -0,0 +1,16 @@
# Smol UCA
An implementation of the [Unicode Collation Algorithm](https://unicode.org/reports/tr10/).
## Todo
- [x] Build the FST at compile time (in `build.rs`)
- [x] Switch to ufst
- [ ] Add benchmarks
## See
- [ziglyph](https://github.com/jecolon/ziglyph/blob/main/src/collator/Collator.zig)
- [pyuca](https://github.com/jtauber/pyuca)
- [collate](https://github.com/tertsdiepraam/collate)
- [UCA Auxiliary Files](http://www.unicode.org/Public/UCA/6.0.0/CollationAuxiliary.html)

108
crates/smol-uca/build.rs Normal file
View File

@@ -0,0 +1,108 @@
use std::env;
use std::fs;
use std::path::Path;
use parse::uca::allkeys;
use ufst::raw::Builder;
/// Build script: turns `data/allkeys.txt` into the artifacts embedded by the crate.
///
/// Three files are written to `OUT_DIR`:
///
/// * `implicit_weights.rs` – the `IMPLICIT_WEIGHTS` table of `(start, end, base)` ranges,
/// * `table.fst` – an FST mapping code point sequences to a packed 64-bit value,
/// * `explicit_weights.rs` – the `EXPLICIT_WEIGHTS` table holding the elements of every
///   entry that has more than one collation element.
///
/// Packed value layout (bit 0 is the tag):
///
/// * tag `0`: a single element – bit 1 `variable`, bits 2.. `l1`, bits 18.. `l2`,
///   bits 34.. `l3`, bits 42.. `l4`;
/// * tag `1`: a slice of `EXPLICIT_WEIGHTS` – bits 1..=8 length, bits 9.. start index.
///
/// NOTE(review): FST keys are the in-memory bytes of the `u32` code points on the machine
/// running this script, while the runtime lookup feeds `to_ne_bytes()` of the target. The
/// two presumably disagree when cross-compiling between endiannesses – confirm before
/// supporting such targets.
///
/// # Panics
///
/// Panics if the input cannot be read, an output cannot be written, an entry has no
/// collation elements, or an entry does not fit the packed layout described above.
fn main() {
    println!("cargo:rerun-if-changed=data/allkeys.txt");
    println!("cargo:rerun-if-changed=build.rs");

    let all_keys = {
        let data = fs::read_to_string("data/allkeys.txt").unwrap();
        allkeys::parse(&data)
    };
    let out_dir = env::var_os("OUT_DIR").unwrap();
    let out_path = Path::new(&out_dir);

    // Generated source for the implicit weight ranges.
    let mut implicit_weights = String::new();
    implicit_weights.push_str(&format!(
        "pub const IMPLICIT_WEIGHTS: [(u32, u32, u32); {}] = [\n",
        all_keys.implicit_weights.len(),
    ));
    for implicit_weight in all_keys.implicit_weights {
        implicit_weights.push_str(&format!(
            "    ({}, {}, {}),\n",
            implicit_weight.start, implicit_weight.end, implicit_weight.base
        ));
    }
    implicit_weights.push_str("];\n");
    fs::write(out_path.join("implicit_weights.rs"), implicit_weights).unwrap();

    // The FST builder requires keys in ascending byte order.
    let mut entries = all_keys
        .entries
        .into_iter()
        .map(|entry| {
            (
                bytemuck::cast_slice::<u32, u8>(&entry.chars).to_vec(),
                entry.elements,
            )
        })
        .collect::<Vec<_>>();
    entries.sort_unstable_by(|(a, _), (b, _)| a.cmp(b));

    let mut builder = Builder::memory();
    // Elements of multi-element entries, referenced from the FST by (index, length).
    let mut overflow = Vec::new();
    for (chars, mut elements) in entries {
        let value = match elements.len() {
            1 => {
                let element = elements.pop().unwrap();
                ((element.l4 as u64) << 42)
                    | ((element.l3 as u64) << 34)
                    | ((element.l2 as u64) << 18)
                    | ((element.l1 as u64) << 2)
                    | (if element.variable { 1 } else { 0 } << 1)
            }
            2.. => {
                let idx = overflow.len();
                let len = elements.len();
                // The reader decodes the length from 8 bits and the index from 32 bits;
                // anything larger would silently bleed into the neighbouring field.
                assert!(
                    len <= 0xFF,
                    "entry has {} collation elements, at most 255 can be encoded",
                    len
                );
                assert!(
                    idx <= u32::MAX as usize,
                    "overflow index {} does not fit into 32 bits",
                    idx
                );
                overflow.extend(elements);
                ((idx as u64) << 9) | ((len as u64) << 1) | 1
            }
            _ => panic!("this shouldn't happen!"),
        };
        builder.insert(chars, value).unwrap();
    }
    let data = builder.into_fst().into_inner();
    fs::write(out_path.join("table.fst"), data).unwrap();

    // Generated source for the overflow elements.
    let mut explicit_weights = String::new();
    explicit_weights.push_str(&format!(
        "pub const EXPLICIT_WEIGHTS: [(u16, u16, u8, u16, bool); {}] = [\n",
        overflow.len(),
    ));
    for element in overflow {
        explicit_weights.push_str(&format!(
            "    ({}, {}, {}, {}, {}),\n",
            element.l1, element.l2, element.l3, element.l4, element.variable,
        ));
    }
    explicit_weights.push_str("];\n");
    fs::write(out_path.join("explicit_weights.rs"), explicit_weights).unwrap();
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,373 @@
use std::iter;
use crate::table::{self, Element};
use crate::weights::IMPLICIT_WEIGHTS;
/// Stateless collator that turns strings into sort keys.
///
/// The type carries no configuration, so it is free to copy and compare.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub struct Collator;
impl Collator {
/// Maps an NFD-normalized sequence of code points to its collation elements.
///
/// At every position the longest table match `S` is taken. Afterwards the run of
/// non-starters following `S` (strictly increasing combining classes) is scanned and the
/// last code point `C` of that run is tried as a discontiguous extension: if `S + C` is a
/// table entry, its elements are used and `C` is removed from the input. Code points
/// without any table entry receive implicit weights.
///
/// Only the last unblocked non-starter is tried, not every one of them.
fn collation_elements(&self, normalized: &[char]) -> Vec<Element> {
    /// Formats code points as space separated hex values (debug tracing only).
    fn hex(code_points: &[char]) -> String {
        code_points
            .iter()
            .map(|c| format!("{:04X}", *c as u32))
            .collect::<Vec<_>>()
            .join(" ")
    }

    /// Formats elements separated by `][` (debug tracing only).
    fn show(elements: &[Element]) -> String {
        elements
            .iter()
            .map(ToString::to_string)
            .collect::<Vec<_>>()
            .join("][")
    }

    // Flip to `true` to trace every decision on stderr.
    let debug = false;
    let mut all_elements = Vec::new();
    let mut code_points = normalized.to_vec();
    let mut cp_index = 0;
    if debug {
        eprintln!("nfd: {}", hex(normalized));
    }
    while cp_index < code_points.len() {
        // Longest contiguous match starting at the current position; `idx` is relative
        // to `cp_index`. No match at all yields no elements and `idx == 0`.
        let (mut elements, idx) = table::lookup(&code_points[cp_index..]).unwrap_or_default();
        if debug {
            eprintln!(
                "found at 1: [{}], idx: {}, start: {:04X}",
                show(&elements),
                idx,
                code_points[cp_index] as u32
            );
        }
        // handle non-starters
        let mut last_class = None;
        let tail_start = cp_index + idx + 1;
        let mut tail_index = tail_start;
        // advance to last combining C
        while tail_index < code_points.len() {
            let combining_class = unf::table::lookup(code_points[tail_index]).combining_class();
            if debug {
                eprintln!(
                    "combining class: {}, start: {:04X}",
                    combining_class, code_points[tail_index] as u32
                );
            }
            if combining_class == 0 {
                if tail_index != tail_start {
                    tail_index -= 1;
                }
                break;
            }
            if let Some(last_class) = last_class {
                if last_class >= combining_class {
                    if tail_index != tail_start {
                        tail_index -= 1;
                    }
                    break;
                }
            }
            last_class = Some(combining_class);
            tail_index += 1;
        }
        if tail_index == code_points.len() {
            tail_index -= 1;
        }
        if tail_index > tail_start {
            let c = code_points[tail_index];
            // The key is the current match only; code points before `cp_index` have
            // already been turned into elements and must not be part of it.
            let s = &code_points[cp_index..tail_start];
            let mut new_key = Vec::with_capacity(s.len() + 1);
            new_key.extend_from_slice(s);
            new_key.push(c);
            if debug {
                eprintln!(
                    "new key: {}, s: {}, c: {:04X}",
                    hex(&new_key),
                    hex(s),
                    c as u32
                );
            }
            // A starter without table entry (implicit weights) simply has no
            // contraction; that is not an error.
            if let Some((new_elements, new_idx)) = table::lookup(&new_key) {
                if debug {
                    eprintln!(
                        "found at 2: [{}], idx: {}, start: {:04X}",
                        show(&new_elements),
                        new_idx,
                        new_key[0] as u32
                    );
                }
                if new_idx == (new_key.len() - 1) && !new_elements.is_empty() {
                    // splice: C has been consumed by the contraction
                    code_points.remove(tail_index);
                    if debug {
                        eprintln!("add part 2 elements to all");
                    }
                    // add elements to final collection
                    all_elements.extend(new_elements);
                    cp_index = tail_start;
                    continue;
                }
            }
        }
        if elements.is_empty() {
            if debug {
                eprintln!("no part 1 elements, use implicit weight");
            }
            elements = self.implicit_weight(code_points[cp_index] as u32);
            if debug {
                eprintln!(
                    "found at 3: [{}], start: {:04X}",
                    show(&elements),
                    code_points[cp_index] as u32
                );
            }
        }
        if debug {
            eprintln!("add part 1 elements to all");
        }
        // add elements to final collection
        all_elements.extend(elements);
        cp_index = tail_start;
    }
    if debug {
        eprintln!("all: [{}]", show(&all_elements));
    }
    all_elements
}
/// Derives the two collation elements of a code point that has no table entry.
///
/// The first element carries the primary weight `AAAA`, the second one `BBBB`:
///
/// * unified ideographs in `4E00..=9FFF` or `F900..=FAFF` use base `0xFB40`,
/// * all other unified ideographs use base `0xFB80`,
/// * code points inside an `IMPLICIT_WEIGHTS` range use that range's base and an offset
///   relative to the range start (`18D00..=18D8F` is offset from `0x17000` instead),
/// * everything else uses base `0xFBC0`.
fn implicit_weight(&self, cp: u32) -> Vec<Element> {
    // Shared `BBBB` computation of the ideograph and fallback cases.
    let low_bits = (cp & 0x7FFF) | 0x8000;

    let (aaaa, bbbb) = if is_unified_ideograph(cp) {
        let base = if matches!(cp, 0x4E00..=0x9FFF | 0xF900..=0xFAFF) {
            0xFB40
        } else {
            0xFB80
        };
        (base + (cp >> 15), low_bits)
    } else {
        let range = IMPLICIT_WEIGHTS
            .iter()
            .find(|(start, end, _)| (*start..=*end).contains(&cp));
        match range {
            Some(&(start, _, base)) => {
                // NOTE(review): presumably a supplement of the script starting at
                // 0x17000, hence the shared origin - confirm against allkeys.txt.
                let origin = if (0x18D00..=0x18D8F).contains(&cp) {
                    0x17000
                } else {
                    start
                };
                (base, (cp - origin) | 0x8000)
            }
            None => (0xFBC0 + (cp >> 15), low_bits),
        }
    };

    let primary = Element {
        l1: (aaaa & 0xFFFF) as u16,
        l2: 0x0020,
        l3: 0x0002,
        l4: 0x0000,
        variable: false,
    };
    let continuation = Element {
        l1: (bbbb & 0xFFFF) as u16,
        l2: 0x0000,
        l3: 0x0000,
        l4: 0x0000,
        variable: false,
    };
    vec![primary, continuation]
}
fn sort_key_from_collation_elements(&self, collation_elements: &[Element]) -> Vec<u16> {
let l1 = collation_elements
.iter()
.map(|element| element.l1)
.filter(|x| *x > 0);
let l2 = collation_elements
.iter()
.map(|element| element.l2)
.filter(|x| *x > 0);
let l3 = collation_elements
.iter()
.map(|element| element.l3 as u16)
.filter(|x| *x > 0);
let l4 = collation_elements
.iter()
.map(|element| element.l4)
.filter(|x| *x > 0);
l1.chain(iter::once(0))
.chain(l2)
.chain(iter::once(0))
.chain(l3)
.chain(iter::once(0))
.chain(l4)
.collect()
}
/// Computes the sort key of `input`.
///
/// The input is decomposed with `unf::nfd`, mapped to collation elements and flattened
/// into a sequence of weights in which `0` separates the levels.
pub fn sort_key<S: AsRef<str>>(&self, input: S) -> Vec<u16> {
    let code_points: Vec<_> = unf::nfd(input.as_ref()).collect();
    let elements = self.collation_elements(&code_points);
    self.sort_key_from_collation_elements(&elements)
}
}
/// Returns `true` if `cp` lies in one of the hard-coded unified ideograph ranges.
pub fn is_unified_ideograph(cp: u32) -> bool {
    matches!(
        cp,
        0x3400..=0x4DBF
            | 0x4E00..=0x9FFF
            | 0xFA0E..=0xFA0F
            | 0xFA11
            | 0xFA13..=0xFA14
            | 0xFA1F
            | 0xFA21
            | 0xFA23..=0xFA24
            | 0xFA27..=0xFA29
            | 0x20000..=0x2A6DF
            | 0x2A700..=0x2B738
            | 0x2B740..=0x2B81D
            | 0x2B820..=0x2CEA1
            | 0x2CEB0..=0x2EBE0
            | 0x30000..=0x3134A
    )
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::fmt;

    /// Renders the NFD form of `text` as space separated hex code points; only used to
    /// enrich assertion failure messages.
    fn nfd_hex(text: &str) -> String {
        unf::nfd(text)
            .map(|ch| format!("{:04X}", ch as u32))
            .collect::<Vec<_>>()
            .join(" ")
    }

    /// Both inputs are expected to produce the same sort key.
    #[test]
    fn test_bug_001() {
        let collator = Collator::default();
        for fixture in ["\u{1abc}\u{334}", "\u{1ac1}\u{334}"] {
            let sort_key = collator.sort_key(fixture);
            similar_asserts::assert_eq!("| 004A 0033 | 0002 0002 |", fmt(&sort_key));
        }
    }

    /// A starter followed by two combining marks.
    #[test]
    fn test_bug_002() {
        let fixture = "\u{a8}\u{301}\u{334}";
        let sort_key = Collator::default().sort_key(fixture);
        similar_asserts::assert_eq!(
            "04D0 | 0020 004A 0024 | 0002 0002 0002 |",
            fmt(&sort_key),
            "nfd: {:?}",
            nfd_hex(fixture)
        );
    }

    /// An implicitly weighted code point followed by a table entry.
    #[test]
    fn test_bug_003() {
        let fixture = "\u{18d00}\u{21}";
        let sort_key = Collator::default().sort_key(fixture);
        similar_asserts::assert_eq!(
            "FB00 9D00 0268 | 0020 0020 | 0002 0002 |",
            fmt(&sort_key),
            "nfd: {:?}",
            nfd_hex(fixture)
        );
    }
}

View File

@@ -0,0 +1,16 @@
pub mod collator;
mod table;
mod weights;
/// Renders a sort key as text: weights as four-digit upper-case hex, `0` separators as
/// `|`, all items joined by single spaces.
pub fn fmt(sort_key: &[u16]) -> String {
    let mut rendered = String::new();
    for (position, weight) in sort_key.iter().enumerate() {
        if position > 0 {
            rendered.push(' ');
        }
        if *weight == 0 {
            rendered.push('|');
        } else {
            rendered.push_str(&format!("{:04X}", weight));
        }
    }
    rendered
}

View File

@@ -0,0 +1,161 @@
use std::fmt::Display;
use ufst::raw::{Fst, Output};
/// Collation table embedded into the binary: the bytes of `table.fst` from `OUT_DIR`,
/// the file the build script writes.
///
/// NOTE(review): `new_unchecked` presumably skips validation of the bytes, which relies
/// on the build script having produced them with the same `ufst` version - confirm.
const TABLE: Fst<&'static [u8]> =
Fst::new_unchecked(include_bytes!(concat!(env!("OUT_DIR"), "/table.fst")));
/// Finds the longest prefix of `value` that is a key of the collation table.
///
/// Returns the collation elements of that key together with the index (into `value`) of
/// the last code point belonging to the match, or `None` if not even the first code point
/// is a key.
pub fn lookup(value: &[char]) -> Option<(Vec<Element>, usize)> {
    let mut node = TABLE.root();
    let mut out = Output::zero();
    // Packed value and position of the longest key seen so far.
    let mut longest: Option<(u64, usize)> = None;

    // Keys are the native-endian bytes of the code points; walk them byte by byte.
    'walk: for (position, code_point) in value.iter().copied().enumerate() {
        for byte in u32::from(code_point).to_ne_bytes() {
            match node.find_input(byte) {
                None => break 'walk,
                Some(trans_index) => {
                    let transition = node.transition(trans_index);
                    node = TABLE.node(transition.addr);
                    out = out.cat(transition.out);
                    if node.is_final() {
                        longest = Some((out.cat(node.final_output()).value(), position));
                    }
                }
            }
        }
    }

    let (packed, matched_index) = longest?;
    let elements = match Value::from_u64(packed) {
        // Single element stored inline in the FST output.
        Value::Entry(element) => vec![element],
        // Several elements stored as a slice of the generated weights table.
        Value::Index(first, count) => {
            let start = first as usize;
            let end = start + count as usize;
            crate::weights::EXPLICIT_WEIGHTS[start..end]
                .iter()
                .map(|&(l1, l2, l3, l4, variable)| Element {
                    l1,
                    l2,
                    l3,
                    l4,
                    variable,
                })
                .collect()
        }
    };
    Some((elements, matched_index))
}
/// A single collation element: one weight per comparison level plus the variable flag.
#[derive(Clone, Copy, PartialEq, Debug)]
pub struct Element {
    /// Level 1 weight.
    pub l1: u16,
    /// Level 2 weight.
    pub l2: u16,
    /// Level 3 weight.
    pub l3: u8,
    /// Level 4 weight.
    pub l4: u16,
    /// Whether the element is marked as variable (rendered as `*` instead of `.`).
    pub variable: bool,
}

impl Display for Element {
    /// Writes the element as `<marker>L1.L2.L3.L4`, every weight as four upper-case hex
    /// digits and the marker being `*` for variable elements and `.` otherwise.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let marker = match self.variable {
            true => '*',
            false => '.',
        };
        write!(
            f,
            "{}{:04X}.{:04X}.{:04X}.{:04X}",
            marker, self.l1, self.l2, self.l3, self.l4
        )
    }
}
/// Decoded form of the 64-bit value stored in the FST for every key.
///
/// Bit 0 is the tag: `0` for [`Value::Entry`], `1` for [`Value::Index`].
#[derive(Clone, Copy, PartialEq, Debug)]
enum Value {
    /// A single collation element stored inline.
    Entry(Element),
    /// Start index and length of a slice of the explicit weights table.
    Index(u32, u8),
}

impl Value {
    /// Packs the value into 64 bits.
    ///
    /// `Entry`: bit 1 `variable`, bits 2..18 `l1`, 18..34 `l2`, 34..42 `l3`, 42..58 `l4`.
    /// `Index`: bits 1..9 length, bits 9..41 index.
    fn to_u64(self) -> u64 {
        match self {
            Self::Index(idx, len) => (u64::from(idx) << 9) | (u64::from(len) << 1) | 1,
            Self::Entry(Element {
                l1,
                l2,
                l3,
                l4,
                variable,
            }) => {
                (u64::from(l4) << 42)
                    | (u64::from(l3) << 34)
                    | (u64::from(l2) << 18)
                    | (u64::from(l1) << 2)
                    | (u64::from(variable) << 1)
            }
        }
    }

    /// Inverse of [`Value::to_u64`]; bits outside the documented fields are ignored.
    fn from_u64(data: u64) -> Self {
        // Extracts the field that starts at bit `shift` and is as wide as `mask`.
        let field = |shift: u32, mask: u64| (data >> shift) & mask;

        if field(0, 1) == 1 {
            return Self::Index(field(9, 0xFFFF_FFFF) as u32, field(1, 0xFF) as u8);
        }
        Self::Entry(Element {
            l1: field(2, 0xFFFF) as u16,
            l2: field(18, 0xFFFF) as u16,
            l3: field(34, 0xFF) as u8,
            l4: field(42, 0xFFFF) as u16,
            variable: field(1, 1) == 1,
        })
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use proptest::prelude::*;

    /// Arbitrary `Value::Index` covering the full range of both fields.
    fn index_strategy() -> impl Strategy<Value = Value> {
        (any::<u32>(), any::<u8>()).prop_map(|(idx, len)| Value::Index(idx, len))
    }

    /// Arbitrary `Value::Entry` covering the full range of every field.
    fn entry_strategy() -> impl Strategy<Value = Value> {
        (
            any::<u16>(),
            any::<u16>(),
            any::<u8>(),
            any::<u16>(),
            any::<bool>(),
        )
            .prop_map(|(l1, l2, l3, l4, variable)| {
                Value::Entry(Element {
                    l1,
                    l2,
                    l3,
                    l4,
                    variable,
                })
            })
    }

    /// Either kind of value.
    fn value_strategy() -> impl Strategy<Value = Value> {
        prop_oneof![index_strategy(), entry_strategy()]
    }

    proptest! {
        /// Packing a value and unpacking it again yields the value itself.
        #[test]
        fn proptest_serialize_and_deserialize(original in value_strategy()) {
            let decoded = Value::from_u64(original.to_u64());
            prop_assert_eq!(original, decoded);
        }
    }
}

View File

@@ -0,0 +1,2 @@
// Both files are generated by the build script and written to `OUT_DIR`.

// Defines `EXPLICIT_WEIGHTS`: `(l1, l2, l3, l4, variable)` tuples of all table entries
// that consist of more than one collation element.
include!(concat!(env!("OUT_DIR"), "/explicit_weights.rs"));
// Defines `IMPLICIT_WEIGHTS`: `(start, end, base)` code point ranges with their base.
include!(concat!(env!("OUT_DIR"), "/implicit_weights.rs"));

View File

@@ -0,0 +1,91 @@
use std::fs::File;
use std::io::{BufRead, BufReader};
use smol_uca::collator::Collator;
/// Runs every line of `data/CollationTest_NON_IGNORABLE.txt` through the collator.
///
/// Each data line holds space separated hex code points, a `;` and a comment whose last
/// bracketed part is the expected sort key. Two things are checked per line: the
/// formatted sort key equals the expected one, and the sort key is not smaller than the
/// one of the previous line. Lines containing surrogates are skipped because they cannot
/// be represented as a `String`.
#[test]
fn collation_test_non_ignorable() {
    let reader = BufReader::new(
        File::open("data/CollationTest_NON_IGNORABLE.txt").expect("collation test data"),
    );
    let collator = Collator::default();
    let mut previous: Option<Vec<u16>> = None;
    let (mut order_errors, mut sort_key_errors) = (0, 0);

    for (n, line) in reader.lines().enumerate() {
        let line = line.expect("line");
        let line = line.trim_start();
        if line.is_empty() || line.starts_with('#') {
            continue;
        }
        let (chars, rest) = line
            .split_once(';')
            .expect("a semicolon separated test line");

        // Decode the code points; surrogates are replaced by a blank and remembered.
        let mut test_string = String::new();
        let mut has_surrogate = false;
        for token in chars.trim().split(' ') {
            let code_point = u32::from_str_radix(token, 16).expect("a valid hex value");
            match char::from_u32(code_point) {
                Some(ch) => test_string.push(ch),
                None if (0xD800u32..=0xDFFF).contains(&code_point) => {
                    has_surrogate = true;
                    test_string.push(' ');
                }
                None => panic!("{}", line),
            }
        }
        if has_surrogate {
            continue;
        }

        let expected_sort_key = rest.rsplit(['[', ']']).nth(1).expect("sort key");
        let sort_key = collator.sort_key(&test_string);
        let fmt_sort_key = smol_uca::fmt(&sort_key);

        // The file is sorted, so keys must never decrease from one line to the next.
        let out_of_order = previous
            .as_ref()
            .map_or(false, |prev_sort_key| &sort_key < prev_sort_key);
        if out_of_order {
            eprintln!(
                "Error at line {}: {:?} [{}]",
                n + 1,
                test_string,
                expected_sort_key
            );
            order_errors += 1;
        }
        previous = Some(sort_key);

        if fmt_sort_key != expected_sort_key {
            eprintln!(
                "Error at line {}: {:?} expected: [{}], got: [{}] ({})",
                n + 1,
                unf::nfd(&test_string).collect::<String>(),
                expected_sort_key,
                fmt_sort_key,
                line
            );
            sort_key_errors += 1;
        }
    }

    assert_eq!(order_errors, 0);
    assert_eq!(sort_key_errors, 0);
}