Rename crates
This commit is contained in:
18
crates/u-sort/Cargo.toml
Normal file
18
crates/u-sort/Cargo.toml
Normal file
@@ -0,0 +1,18 @@
|
||||
[package]
|
||||
name = "u-sort"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
parse = { path = "../parse" }
|
||||
u-fst = { path = "../u-fst" }
|
||||
u-norm = { path = "../u-norm" }
|
||||
|
||||
[build-dependencies]
|
||||
bytemuck = "1.9.1"
|
||||
parse = { path = "../parse" }
|
||||
u-fst = { path = "../u-fst" }
|
||||
|
||||
[dev-dependencies]
|
||||
proptest = "1.0.0"
|
||||
similar-asserts = "1.2.0"
|
||||
16
crates/u-sort/README.md
Normal file
16
crates/u-sort/README.md
Normal file
@@ -0,0 +1,16 @@
|
||||
# Smol UCA
|
||||
|
||||
Implementation of [Unicode Collation Algorithm](https://unicode.org/reports/tr10/).
|
||||
|
||||
## Todo
|
||||
|
||||
- [x] Build fst on build
|
||||
- [x] Switch to ufst
|
||||
- [ ] Add benchmarks
|
||||
|
||||
## See
|
||||
|
||||
- [ziglyph](https://github.com/jecolon/ziglyph/blob/main/src/collator/Collator.zig)
|
||||
- [pyuca](https://github.com/jtauber/pyuca)
|
||||
- [collate](https://github.com/tertsdiepraam/collate)
|
||||
- [UCA Auxiliary Files](http://www.unicode.org/Public/UCA/6.0.0/CollationAuxiliary.html)
|
||||
108
crates/u-sort/build.rs
Normal file
108
crates/u-sort/build.rs
Normal file
@@ -0,0 +1,108 @@
|
||||
use std::env;
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
|
||||
use parse::uca::allkeys;
|
||||
use u_fst::raw::Builder;
|
||||
|
||||
fn main() {
|
||||
println!("cargo:rerun-if-changed=data/allkeys.txt");
|
||||
println!("cargo:rerun-if-changed=build.rs");
|
||||
|
||||
let all_keys = {
|
||||
let data = std::fs::read_to_string("data/allkeys.txt").unwrap();
|
||||
|
||||
allkeys::parse(&data)
|
||||
};
|
||||
|
||||
let out_dir = env::var_os("OUT_DIR").unwrap();
|
||||
|
||||
let mut implicit_weights = String::new();
|
||||
|
||||
implicit_weights.push_str(&format!(
|
||||
"pub const IMPLICIT_WEIGHTS: [(u32, u32, u32); {}] = [\n",
|
||||
all_keys.implicit_weights.len(),
|
||||
));
|
||||
|
||||
for implicit_weight in all_keys.implicit_weights {
|
||||
implicit_weights.push_str(&format!(
|
||||
" ({}, {}, {}),\n",
|
||||
implicit_weight.start, implicit_weight.end, implicit_weight.base
|
||||
));
|
||||
}
|
||||
|
||||
implicit_weights.push_str("];\n");
|
||||
|
||||
fs::write(
|
||||
&Path::new(&out_dir).join("implicit_weights.rs"),
|
||||
implicit_weights,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let mut entries = all_keys
|
||||
.entries
|
||||
.into_iter()
|
||||
.map(|entry| {
|
||||
(
|
||||
bytemuck::cast_slice::<u32, u8>(&entry.chars).to_vec(),
|
||||
entry.elements,
|
||||
)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
entries.sort_unstable_by(|(a, _), (b, _)| a.cmp(b));
|
||||
|
||||
let mut builder = Builder::memory();
|
||||
let mut overflow = Vec::new();
|
||||
|
||||
for (chars, mut elements) in entries.into_iter() {
|
||||
let value = match elements.len() {
|
||||
1 => {
|
||||
let element = elements.pop().unwrap();
|
||||
|
||||
((element.l4 as u64) << 42)
|
||||
| ((element.l3 as u64) << 34)
|
||||
| ((element.l2 as u64) << 18)
|
||||
| ((element.l1 as u64) << 2)
|
||||
| (if element.variable { 1 } else { 0 } << 1)
|
||||
}
|
||||
2.. => {
|
||||
let idx = overflow.len();
|
||||
let len = elements.len();
|
||||
|
||||
overflow.extend(elements.into_iter());
|
||||
|
||||
((idx as u64) << 9) | ((len as u64) << 1) | 1
|
||||
}
|
||||
_ => panic!("this shouldn't happen!"),
|
||||
};
|
||||
|
||||
builder.insert(chars, value).unwrap();
|
||||
}
|
||||
|
||||
let data = builder.into_fst().into_inner();
|
||||
|
||||
fs::write(&Path::new(&out_dir).join("table.fst"), data).unwrap();
|
||||
|
||||
let mut explicit_weights = String::new();
|
||||
|
||||
explicit_weights.push_str(&format!(
|
||||
"pub const EXPLICIT_WEIGHTS: [(u16, u16, u8, u16, bool); {}] = [\n",
|
||||
overflow.len(),
|
||||
));
|
||||
|
||||
for element in overflow {
|
||||
explicit_weights.push_str(&format!(
|
||||
" ({}, {}, {}, {}, {}),\n",
|
||||
element.l1, element.l2, element.l3, element.l4, element.variable,
|
||||
));
|
||||
}
|
||||
|
||||
explicit_weights.push_str("];\n");
|
||||
|
||||
fs::write(
|
||||
&Path::new(&out_dir).join("explicit_weights.rs"),
|
||||
explicit_weights,
|
||||
)
|
||||
.unwrap();
|
||||
}
|
||||
211458
crates/u-sort/data/CollationTest_NON_IGNORABLE.txt
Normal file
211458
crates/u-sort/data/CollationTest_NON_IGNORABLE.txt
Normal file
File diff suppressed because it is too large
Load Diff
227503
crates/u-sort/data/CollationTest_SHIFTED.txt
Normal file
227503
crates/u-sort/data/CollationTest_SHIFTED.txt
Normal file
File diff suppressed because it is too large
Load Diff
33925
crates/u-sort/data/allkeys.txt
Normal file
33925
crates/u-sort/data/allkeys.txt
Normal file
File diff suppressed because it is too large
Load Diff
374
crates/u-sort/src/collator.rs
Normal file
374
crates/u-sort/src/collator.rs
Normal file
@@ -0,0 +1,374 @@
|
||||
use std::iter;
|
||||
|
||||
use crate::table::{self, Element};
|
||||
use crate::weights::IMPLICIT_WEIGHTS;
|
||||
|
||||
/// Stateless collator that turns strings into sort keys (see `Collator::sort_key`).
///
/// The type carries no configuration, so it is freely copyable.
#[derive(Debug, Default, Clone, Copy)]
pub struct Collator;
|
||||
|
||||
impl Collator {
|
||||
fn collation_elements(&self, normalized: &[char]) -> Vec<Element> {
|
||||
let debug = false;
|
||||
|
||||
let mut all_elements = Vec::new();
|
||||
|
||||
let mut code_points = normalized.to_vec();
|
||||
let mut code_points_len = code_points.len();
|
||||
let mut cp_index = 0;
|
||||
|
||||
if debug {
|
||||
eprintln!(
|
||||
"nfd: {}",
|
||||
normalized
|
||||
.iter()
|
||||
.map(|c| format!("{:04X}", *c as u32))
|
||||
.collect::<Vec<_>>()
|
||||
.join(" ")
|
||||
);
|
||||
}
|
||||
|
||||
while cp_index < code_points_len {
|
||||
let (mut elements, idx) = table::lookup(&code_points[cp_index..]).unwrap_or_default();
|
||||
|
||||
if debug {
|
||||
eprintln!(
|
||||
"found at 1: [{}], idx: {}, start: {:04X}",
|
||||
elements
|
||||
.iter()
|
||||
.map(ToString::to_string)
|
||||
.collect::<Vec<_>>()
|
||||
.join("]["),
|
||||
idx,
|
||||
code_points[cp_index] as u32
|
||||
);
|
||||
}
|
||||
|
||||
let s = &code_points[0..cp_index + idx + 1];
|
||||
|
||||
// handle non-starters
|
||||
let mut last_class = None;
|
||||
let tail_start = cp_index + idx + 1;
|
||||
let mut tail_index = tail_start;
|
||||
|
||||
// advance to last combining C
|
||||
while tail_index < code_points_len {
|
||||
let combining_class =
|
||||
u_norm::table::lookup(code_points[tail_index]).combining_class();
|
||||
|
||||
if debug {
|
||||
eprintln!(
|
||||
"combining class: {}, start: {:04X}",
|
||||
combining_class, code_points[tail_index] as u32
|
||||
);
|
||||
}
|
||||
|
||||
if combining_class == 0 {
|
||||
if tail_index != tail_start {
|
||||
tail_index -= 1;
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
if let Some(last_class) = last_class {
|
||||
if last_class >= combining_class {
|
||||
if tail_index != tail_start {
|
||||
tail_index -= 1;
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
last_class = Some(combining_class);
|
||||
tail_index += 1;
|
||||
}
|
||||
|
||||
if tail_index == code_points_len {
|
||||
tail_index -= 1;
|
||||
}
|
||||
|
||||
if tail_index > tail_start {
|
||||
let c = code_points[tail_index];
|
||||
|
||||
let mut new_key = Vec::with_capacity(s.len() + 1);
|
||||
new_key.extend_from_slice(s);
|
||||
new_key.push(c);
|
||||
|
||||
if debug {
|
||||
eprintln!(
|
||||
"new key: {}, s: {}, c: {:04X}",
|
||||
new_key
|
||||
.iter()
|
||||
.map(|c| format!("{:04X}", *c as u32))
|
||||
.collect::<Vec<_>>()
|
||||
.join(" "),
|
||||
s.iter()
|
||||
.map(|c| format!("{:04X}", *c as u32))
|
||||
.collect::<Vec<_>>()
|
||||
.join(" "),
|
||||
c as u32
|
||||
);
|
||||
}
|
||||
|
||||
let (new_elements, new_idx) = table::lookup(&new_key).expect("lookup");
|
||||
|
||||
if debug {
|
||||
eprintln!(
|
||||
"found at 2: [{}], idx: {}, start: {:04X}",
|
||||
new_elements
|
||||
.iter()
|
||||
.map(ToString::to_string)
|
||||
.collect::<Vec<_>>()
|
||||
.join("]["),
|
||||
new_idx,
|
||||
new_key[0] as u32
|
||||
);
|
||||
}
|
||||
|
||||
if new_idx == (new_key.len() - 1) && !new_elements.is_empty() {
|
||||
cp_index = tail_start;
|
||||
|
||||
// splice
|
||||
let mut tmp = Vec::with_capacity(code_points_len - 1);
|
||||
tmp.extend_from_slice(&code_points[0..tail_index]);
|
||||
|
||||
if tail_index + 1 < code_points_len {
|
||||
tmp.extend_from_slice(&code_points[tail_index + 1..]);
|
||||
}
|
||||
|
||||
code_points = tmp;
|
||||
code_points_len = code_points.len();
|
||||
|
||||
if debug {
|
||||
eprintln!("add part 2 elements to all");
|
||||
}
|
||||
|
||||
// add elements to final collection
|
||||
all_elements.extend(new_elements);
|
||||
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if elements.is_empty() {
|
||||
if debug {
|
||||
eprintln!("no part 1 elements, use implicit weight");
|
||||
}
|
||||
|
||||
elements = self.implicit_weight(code_points[0] as u32);
|
||||
|
||||
if debug {
|
||||
eprintln!(
|
||||
"found at 3: [{}], start: {:04X}",
|
||||
elements
|
||||
.iter()
|
||||
.map(ToString::to_string)
|
||||
.collect::<Vec<_>>()
|
||||
.join("]["),
|
||||
code_points[0] as u32
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
if debug {
|
||||
eprintln!("add part 1 elements to all");
|
||||
}
|
||||
|
||||
// add elements to final collection
|
||||
all_elements.extend(elements);
|
||||
|
||||
cp_index += idx + 1;
|
||||
}
|
||||
|
||||
if debug {
|
||||
eprintln!(
|
||||
"all: [{}]",
|
||||
all_elements
|
||||
.iter()
|
||||
.map(ToString::to_string)
|
||||
.collect::<Vec<_>>()
|
||||
.join("][")
|
||||
);
|
||||
}
|
||||
|
||||
all_elements
|
||||
}
|
||||
|
||||
fn implicit_weight(&self, cp: u32) -> Vec<Element> {
|
||||
let base;
|
||||
let mut aaaa = None;
|
||||
let mut bbbb = 0;
|
||||
|
||||
if is_unified_ideograph(cp)
|
||||
&& ((cp >= 0x4E00 && cp <= 0x9FFF) || (cp >= 0xF900 && cp <= 0xFAFF))
|
||||
{
|
||||
base = 0xFB40;
|
||||
aaaa = Some(base + (cp >> 15));
|
||||
bbbb = (cp & 0x7FFF) | 0x8000;
|
||||
} else if is_unified_ideograph(cp)
|
||||
&& !((cp >= 0x4E00 && cp <= 0x9FFF) || (cp >= 0xF900 && cp <= 0xFAFF))
|
||||
{
|
||||
base = 0xFB80;
|
||||
aaaa = Some(base + (cp >> 15));
|
||||
bbbb = (cp & 0x7FFF) | 0x8000;
|
||||
} else {
|
||||
if let Some((start, _, base)) = IMPLICIT_WEIGHTS
|
||||
.iter()
|
||||
.find(|(start, end, _)| cp >= *start && cp <= *end)
|
||||
{
|
||||
aaaa = Some(*base);
|
||||
bbbb = (cp - *start) | 0x8000;
|
||||
|
||||
if cp >= 0x18D00 && cp <= 0x18D8F {
|
||||
bbbb = (cp - 0x17000) | 0x8000;
|
||||
} else {
|
||||
bbbb = (cp - *start) | 0x8000;
|
||||
}
|
||||
}
|
||||
|
||||
if aaaa.is_none() {
|
||||
base = 0xFBC0;
|
||||
aaaa = Some(base + (cp >> 15));
|
||||
bbbb = (cp & 0x7FFF) | 0x8000;
|
||||
}
|
||||
}
|
||||
|
||||
vec![
|
||||
Element {
|
||||
l1: aaaa.map(|x| (x & 0xFFFF) as u16).unwrap_or(0),
|
||||
l2: 0x0020,
|
||||
l3: 0x0002,
|
||||
l4: 0x0000,
|
||||
variable: false,
|
||||
},
|
||||
Element {
|
||||
l1: (bbbb & 0xFFFF) as u16,
|
||||
l2: 0x0000,
|
||||
l3: 0x0000,
|
||||
l4: 0x0000,
|
||||
variable: false,
|
||||
},
|
||||
]
|
||||
}
|
||||
|
||||
/// Flattens collation elements into a sort key.
///
/// The key lists all non-zero level-1 weights, then a `0` separator, then the non-zero
/// level-2 weights, a separator, the non-zero level-3 weights, a separator, and finally
/// the non-zero level-4 weights (no trailing separator).
fn sort_key_from_collation_elements(&self, collation_elements: &[Element]) -> Vec<u16> {
    let mut key = Vec::new();

    key.extend(collation_elements.iter().map(|e| e.l1).filter(|&w| w > 0));
    key.push(0);

    key.extend(collation_elements.iter().map(|e| e.l2).filter(|&w| w > 0));
    key.push(0);

    key.extend(
        collation_elements
            .iter()
            .map(|e| u16::from(e.l3))
            .filter(|&w| w > 0),
    );
    key.push(0);

    key.extend(collation_elements.iter().map(|e| e.l4).filter(|&w| w > 0));

    key
}
|
||||
|
||||
pub fn sort_key<S: AsRef<str>>(&self, input: S) -> Vec<u16> {
|
||||
let normalized = u_norm::nfd(input.as_ref()).collect::<Vec<_>>();
|
||||
let collation_elements = self.collation_elements(&normalized);
|
||||
|
||||
self.sort_key_from_collation_elements(&collation_elements)
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns `true` when `cp` lies in one of the hard-coded unified-ideograph ranges below.
pub fn is_unified_ideograph(cp: u32) -> bool {
    // Inclusive `(first, last)` ranges, in ascending order.
    const RANGES: [(u32, u32); 15] = [
        (0x3400, 0x4DBF),
        (0x4E00, 0x9FFF),
        (0xFA0E, 0xFA0F),
        (0xFA11, 0xFA11),
        (0xFA13, 0xFA14),
        (0xFA1F, 0xFA1F),
        (0xFA21, 0xFA21),
        (0xFA23, 0xFA24),
        (0xFA27, 0xFA29),
        (0x20000, 0x2A6DF),
        (0x2A700, 0x2B738),
        (0x2B740, 0x2B81D),
        (0x2B820, 0x2CEA1),
        (0x2CEB0, 0x2EBE0),
        (0x30000, 0x3134A),
    ];

    RANGES
        .iter()
        .any(|&(first, last)| first <= cp && cp <= last)
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use crate::fmt;

    use super::*;

    /// NFD code points of `fixture` as space-separated hex, used in failure messages.
    fn nfd_hex(fixture: &str) -> String {
        u_norm::nfd(fixture)
            .map(|ch| format!("{:04X}", ch as u32))
            .collect::<Vec<_>>()
            .join(" ")
    }

    /// Both fixtures are expected to produce the same sort key.
    #[test]
    fn test_bug_001() {
        let collator = Collator::default();

        for fixture in ["\u{1abc}\u{334}", "\u{1ac1}\u{334}"] {
            let sort_key = collator.sort_key(fixture);

            similar_asserts::assert_eq!("| 004A 0033 | 0002 0002 |", fmt(&sort_key));
        }
    }

    /// A base character followed by two combining marks.
    #[test]
    fn test_bug_002() {
        let fixture = "\u{a8}\u{301}\u{334}";
        let sort_key = Collator::default().sort_key(fixture);

        similar_asserts::assert_eq!(
            "04D0 | 0020 004A 0024 | 0002 0002 0002 |",
            fmt(&sort_key),
            "nfd: {:?}",
            nfd_hex(fixture)
        );
    }

    /// A code point in the specially handled `18D00..=18D8F` implicit-weight range.
    #[test]
    fn test_bug_003() {
        let fixture = "\u{18d00}\u{21}";
        let sort_key = Collator::default().sort_key(fixture);

        similar_asserts::assert_eq!(
            "FB00 9D00 0268 | 0020 0020 | 0002 0002 |",
            fmt(&sort_key),
            "nfd: {:?}",
            nfd_hex(fixture)
        );
    }
}
|
||||
16
crates/u-sort/src/lib.rs
Normal file
16
crates/u-sort/src/lib.rs
Normal file
@@ -0,0 +1,16 @@
|
||||
pub mod collator;
|
||||
mod table;
|
||||
mod weights;
|
||||
|
||||
/// Renders a sort key as space-separated tokens.
///
/// A zero weight (the level separator) is printed as `|`; every other weight is printed
/// as four upper-case hex digits.
pub fn fmt(sort_key: &[u16]) -> String {
    let mut rendered = String::new();

    for (position, &weight) in sort_key.iter().enumerate() {
        if position > 0 {
            rendered.push(' ');
        }

        if weight == 0 {
            rendered.push('|');
        } else {
            rendered.push_str(&format!("{:04X}", weight));
        }
    }

    rendered
}
|
||||
52
crates/u-sort/src/main.rs
Normal file
52
crates/u-sort/src/main.rs
Normal file
@@ -0,0 +1,52 @@
|
||||
use std::cmp;
|
||||
use std::collections::HashMap;
|
||||
use std::fs;
|
||||
|
||||
use parse::uca::allkeys;
|
||||
|
||||
fn main() {
|
||||
let allkeys = {
|
||||
let data = fs::read_to_string("data/allkeys.txt").unwrap();
|
||||
|
||||
allkeys::parse(&data)
|
||||
};
|
||||
|
||||
let mut l1 = 0;
|
||||
let mut l2 = 0;
|
||||
let mut l3 = 0;
|
||||
let mut l4 = 0;
|
||||
|
||||
let mut count = HashMap::new();
|
||||
|
||||
for entry in allkeys.entries {
|
||||
count
|
||||
.entry(entry.elements.len())
|
||||
.and_modify(|x| *x += 1)
|
||||
.or_insert(1);
|
||||
|
||||
for element in entry.elements {
|
||||
l1 = cmp::max(l1, element.l1);
|
||||
l2 = cmp::max(l2, element.l2);
|
||||
l3 = cmp::max(l3, element.l3);
|
||||
l4 = cmp::max(l4, element.l4);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
l1 = 16 bits
|
||||
l2 = 9 bits
|
||||
l3 = 5 bits
|
||||
l4 = 0 bits
|
||||
variable = 1 bit
|
||||
|
||||
total = 31 bits
|
||||
*/
|
||||
|
||||
println!("l1: {} - {} bit(s)", l1, u16::BITS - l1.leading_zeros());
|
||||
println!("l2: {} - {} bit(s)", l2, u16::BITS - l2.leading_zeros());
|
||||
println!("l3: {} - {} bit(s)", l3, u8::BITS - l3.leading_zeros());
|
||||
println!("l4: {} - {} bit(s)", l4, u16::BITS - l4.leading_zeros());
|
||||
println!("variable: 1 bit(s)");
|
||||
println!();
|
||||
println!("{:#?}", count);
|
||||
}
|
||||
161
crates/u-sort/src/table.rs
Normal file
161
crates/u-sort/src/table.rs
Normal file
@@ -0,0 +1,161 @@
|
||||
use std::fmt::Display;
|
||||
|
||||
use u_fst::raw::{Fst, Output};
|
||||
|
||||
const TABLE: Fst<&'static [u8]> =
|
||||
Fst::new_unchecked(include_bytes!(concat!(env!("OUT_DIR"), "/table.fst")));
|
||||
|
||||
pub fn lookup(value: &[char]) -> Option<(Vec<Element>, usize)> {
|
||||
let mut node = TABLE.root();
|
||||
let mut out = Output::zero();
|
||||
|
||||
let mut last_match = None;
|
||||
|
||||
'char: for (i, &c) in value.iter().enumerate() {
|
||||
for b in (c as u32).to_ne_bytes() {
|
||||
if let Some(trans_index) = node.find_input(b) {
|
||||
let t = node.transition(trans_index);
|
||||
|
||||
node = TABLE.node(t.addr);
|
||||
out = out.cat(t.out);
|
||||
|
||||
if node.is_final() {
|
||||
last_match = Some((out.cat(node.final_output()).value(), i));
|
||||
}
|
||||
} else {
|
||||
break 'char;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
last_match.map(|(data, idx)| {
|
||||
(
|
||||
match Value::from_u64(data) {
|
||||
Value::Entry(element) => vec![element],
|
||||
Value::Index(idx, len) => {
|
||||
let start = idx as usize;
|
||||
let end = start + len as usize;
|
||||
|
||||
crate::weights::EXPLICIT_WEIGHTS[start..end]
|
||||
.iter()
|
||||
.map(|(l1, l2, l3, l4, variable)| Element {
|
||||
l1: *l1,
|
||||
l2: *l2,
|
||||
l3: *l3,
|
||||
l4: *l4,
|
||||
variable: *variable,
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
},
|
||||
idx,
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
/// A single collation element: one weight per level plus the "variable" flag.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub struct Element {
    /// Level-1 weight.
    pub l1: u16,
    /// Level-2 weight.
    pub l2: u16,
    /// Level-3 weight.
    pub l3: u8,
    /// Level-4 weight.
    pub l4: u16,
    /// Whether the element is flagged as variable (rendered as a leading `*`).
    pub variable: bool,
}

impl Display for Element {
    /// Formats the element as `<marker>L1.L2.L3.L4`, each weight as four upper-case hex
    /// digits; the marker is `*` for variable elements and `.` otherwise.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let marker = if self.variable { "*" } else { "." };

        write!(
            f,
            "{}{:04X}.{:04X}.{:04X}.{:04X}",
            marker, self.l1, self.l2, self.l3, self.l4
        )
    }
}
|
||||
|
||||
#[derive(Clone, Copy, PartialEq, Debug)]
|
||||
enum Value {
|
||||
Entry(Element),
|
||||
Index(u32, u8),
|
||||
}
|
||||
|
||||
impl Value {
|
||||
fn to_u64(self) -> u64 {
|
||||
match self {
|
||||
Self::Entry(element) => {
|
||||
((element.l4 as u64) << 42)
|
||||
| ((element.l3 as u64) << 34)
|
||||
| ((element.l2 as u64) << 18)
|
||||
| ((element.l1 as u64) << 2)
|
||||
| (if element.variable { 1 } else { 0 } << 1)
|
||||
}
|
||||
Self::Index(idx, len) => ((idx as u64) << 9) | ((len as u64) << 1) | 1,
|
||||
}
|
||||
}
|
||||
|
||||
fn from_u64(data: u64) -> Self {
|
||||
if (data & 1) == 0 {
|
||||
let variable = ((data >> 1) & 1) == 1;
|
||||
|
||||
let l1 = ((data >> 2) & 0xFFFF) as u16;
|
||||
let l2 = ((data >> 18) & 0xFFFF) as u16;
|
||||
let l3 = ((data >> 34) & 0xFF) as u8;
|
||||
let l4 = ((data >> 42) & 0xFFFF) as u16;
|
||||
|
||||
Self::Entry(Element {
|
||||
l1,
|
||||
l2,
|
||||
l3,
|
||||
l4,
|
||||
variable,
|
||||
})
|
||||
} else {
|
||||
let len = ((data >> 1) & 0xFF) as u8;
|
||||
let idx = ((data >> 9) & 0xFFFFFFFF) as u32;
|
||||
|
||||
Self::Index(idx, len)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use proptest::prelude::*;

    use super::*;

    /// Arbitrary `Value::Index` values.
    fn index_strategy() -> impl Strategy<Value = Value> {
        (any::<u32>(), any::<u8>()).prop_map(|(idx, len)| Value::Index(idx, len))
    }

    /// Arbitrary `Value::Entry` values covering the full range of every field.
    fn entry_strategy() -> impl Strategy<Value = Value> {
        (
            any::<u16>(),
            any::<u16>(),
            any::<u8>(),
            any::<u16>(),
            any::<bool>(),
        )
            .prop_map(|(l1, l2, l3, l4, variable)| {
                Value::Entry(Element {
                    l1,
                    l2,
                    l3,
                    l4,
                    variable,
                })
            })
    }

    /// Either kind of value.
    fn value_strategy() -> impl Strategy<Value = Value> {
        prop_oneof![index_strategy(), entry_strategy()]
    }

    proptest! {
        /// Packing and then unpacking must give back the original value.
        #[test]
        fn proptest_serialize_and_deserialize(a in value_strategy()) {
            let b = Value::from_u64(a.to_u64());

            prop_assert_eq!(a, b);
        }
    }
}
|
||||
2
crates/u-sort/src/weights.rs
Normal file
2
crates/u-sort/src/weights.rs
Normal file
@@ -0,0 +1,2 @@
|
||||
include!(concat!(env!("OUT_DIR"), "/explicit_weights.rs"));
|
||||
include!(concat!(env!("OUT_DIR"), "/implicit_weights.rs"));
|
||||
91
crates/u-sort/tests/collation_test.rs
Normal file
91
crates/u-sort/tests/collation_test.rs
Normal file
@@ -0,0 +1,91 @@
|
||||
use std::fs::File;
|
||||
use std::io::{BufRead, BufReader};
|
||||
|
||||
use u_sort::collator::Collator;
|
||||
|
||||
/// Runs the collator over `data/CollationTest_NON_IGNORABLE.txt`.
///
/// For every test line (skipping blanks, `#` comments and lines containing surrogate code
/// points, which a Rust `String` cannot hold) it checks that
/// * the sort key is not smaller than the previous line's key, and
/// * the formatted sort key equals the bracketed key given on the line.
///
/// All mismatches are reported on stderr; the test fails at the end if there were any.
#[test]
fn collation_test_non_ignorable() {
    let data = File::open("data/CollationTest_NON_IGNORABLE.txt")
        .map(BufReader::new)
        .expect("collation test data");

    let collator = Collator::default();

    let mut prev_sort_key: Option<Vec<u16>> = None;
    let mut order_errors = 0;
    let mut sort_key_errors = 0;

    for (n, line) in data.lines().enumerate() {
        let line = line.expect("line");
        let line = line.trim_start();

        if line.is_empty() || line.starts_with('#') {
            continue;
        }

        let (chars, rest) = line
            .split_once(';')
            .expect("a semicolon separated test line");

        // Decode the space-separated hex code points left of the semicolon.
        let mut surrogates = false;
        let mut test_string = String::new();

        for hex in chars.trim().split(' ') {
            let code_point = u32::from_str_radix(hex, 16).expect("a valid hex value");

            match char::from_u32(code_point) {
                Some(ch) => test_string.push(ch),
                None if (0xD800u32..=0xDFFF).contains(&code_point) => {
                    surrogates = true;
                    test_string.push(' ');
                }
                None => panic!("{}", line),
            }
        }

        if surrogates {
            continue;
        }

        // The expected key is the text between the last `[` and `]` on the line.
        let expected_sort_key = rest.rsplit(['[', ']']).nth(1).expect("sort key");

        let sort_key = collator.sort_key(&test_string);
        let fmt_sort_key = u_sort::fmt(&sort_key);

        let out_of_order = match &prev_sort_key {
            Some(previous) => sort_key < *previous,
            None => false,
        };

        if out_of_order {
            eprintln!(
                "Error at line {}: {:?} [{}]",
                n + 1,
                test_string,
                expected_sort_key
            );

            order_errors += 1;
        }

        if fmt_sort_key != expected_sort_key {
            eprintln!(
                "Error at line {}: {:?} expected: [{}], got: [{}] ({})",
                n + 1,
                u_norm::nfd(&test_string).collect::<String>(),
                expected_sort_key,
                fmt_sort_key,
                line
            );

            sort_key_errors += 1;
        }

        prev_sort_key = Some(sort_key);
    }

    assert_eq!(order_errors, 0);
    assert_eq!(sort_key_errors, 0);
}
|
||||
Reference in New Issue
Block a user