Rename crates

This commit is contained in:
2022-05-24 20:58:27 +02:00
parent 8a8baffba8
commit 9f44196e6c
51 changed files with 2531 additions and 54 deletions

View File

@@ -0,0 +1,374 @@
use std::iter;
use crate::table::{self, Element};
use crate::weights::IMPLICIT_WEIGHTS;
/// Stateless collator that turns text into `u16` sort keys.
///
/// The type carries no configuration, so every instance behaves the same;
/// construct one with `Collator::default()` (or the unit value `Collator`).
#[derive(Debug, Clone, Copy, Default)]
pub struct Collator;
impl Collator {
/// Maps a normalized code point sequence to its collation elements.
///
/// For each position the longest table match `S` starting at that position is
/// looked up. After `S`, the run of following code points whose combining
/// classes are non-zero and strictly increasing is scanned; if that run holds
/// at least two code points, its last code point `C` is tried as a
/// discontiguous extension: when `S + C` has a table entry covering the whole
/// key, the elements of `S + C` are emitted, `C` is removed from the working
/// copy, and scanning resumes right after `S`.
/// NOTE(review): this appears to be a simplified form of the discontiguous
/// matching in UTS #10 (step S2.1); only one candidate `C` is tried per
/// position — confirm this is intended.
///
/// Code points without any table entry receive implicit weights via
/// [`Self::implicit_weight`].
///
/// `normalized` is expected to be in NFD (the only caller passes the output
/// of `u_norm::nfd`).
fn collation_elements(&self, normalized: &[char]) -> Vec<Element> {
    /// Renders code points as space-separated, 4-digit upper-case hex.
    fn hex(code_points: &[char]) -> String {
        code_points
            .iter()
            .map(|c| format!("{:04X}", *c as u32))
            .collect::<Vec<_>>()
            .join(" ")
    }

    /// Renders collation elements joined by `][`.
    fn show(elements: &[Element]) -> String {
        elements
            .iter()
            .map(ToString::to_string)
            .collect::<Vec<_>>()
            .join("][")
    }

    // Flip to `true` to trace every lookup step on stderr.
    let debug = false;
    let mut all_elements = Vec::new();
    // Working copy: discontiguous matches remove the consumed combining mark.
    let mut code_points = normalized.to_vec();
    let mut cp_index = 0;
    if debug {
        eprintln!("nfd: {}", hex(normalized));
    }
    while cp_index < code_points.len() {
        // `idx` is relative to `cp_index`; a miss yields no elements and idx 0.
        let (mut elements, idx) = table::lookup(&code_points[cp_index..]).unwrap_or_default();
        if debug {
            eprintln!(
                "found at 1: [{}], idx: {}, start: {:04X}",
                show(&elements),
                idx,
                code_points[cp_index] as u32
            );
        }
        // handle non-starters
        let mut last_class = None;
        let tail_start = cp_index + idx + 1;
        let mut tail_index = tail_start;
        // advance to last combining C
        while tail_index < code_points.len() {
            let combining_class =
                u_norm::table::lookup(code_points[tail_index]).combining_class();
            if debug {
                eprintln!(
                    "combining class: {}, start: {:04X}",
                    combining_class, code_points[tail_index] as u32
                );
            }
            // A starter ends the run; step back onto the last non-starter.
            if combining_class == 0 {
                if tail_index != tail_start {
                    tail_index -= 1;
                }
                break;
            }
            // A non-increasing class means this code point is blocked.
            if let Some(previous_class) = last_class {
                if previous_class >= combining_class {
                    if tail_index != tail_start {
                        tail_index -= 1;
                    }
                    break;
                }
            }
            last_class = Some(combining_class);
            tail_index += 1;
        }
        // The run reached the end of input: point at the final code point.
        if tail_index == code_points.len() {
            tail_index -= 1;
        }
        // `tail_index == tail_start` would be a contiguous match, which the
        // first lookup has already covered.
        if tail_index > tail_start {
            // The key must start at the current position, not at the start
            // of the whole input.
            let s = &code_points[cp_index..tail_start];
            let c = code_points[tail_index];
            let mut new_key = Vec::with_capacity(s.len() + 1);
            new_key.extend_from_slice(s);
            new_key.push(c);
            if debug {
                eprintln!(
                    "new key: {}, s: {}, c: {:04X}",
                    hex(&new_key),
                    hex(s),
                    c as u32
                );
            }
            // A miss here is normal (e.g. `S` itself has no table entry), so
            // fall through to the regular handling instead of panicking.
            if let Some((new_elements, new_idx)) = table::lookup(&new_key) {
                if debug {
                    eprintln!(
                        "found at 2: [{}], idx: {}, start: {:04X}",
                        show(&new_elements),
                        new_idx,
                        new_key[0] as u32
                    );
                }
                // Accept only when the whole key `S + C` matched.
                if new_idx == new_key.len() - 1 && !new_elements.is_empty() {
                    cp_index = tail_start;
                    // splice: drop the consumed combining mark
                    code_points.remove(tail_index);
                    if debug {
                        eprintln!("add part 2 elements to all");
                    }
                    // add elements to final collection
                    all_elements.extend(new_elements);
                    continue;
                }
            }
        }
        if elements.is_empty() {
            if debug {
                eprintln!("no part 1 elements, use implicit weight");
            }
            // Derive the weight from the code point at the current position.
            elements = self.implicit_weight(code_points[cp_index] as u32);
            if debug {
                eprintln!(
                    "found at 3: [{}], start: {:04X}",
                    show(&elements),
                    code_points[cp_index] as u32
                );
            }
        }
        if debug {
            eprintln!("add part 1 elements to all");
        }
        // add elements to final collection
        all_elements.extend(elements);
        cp_index += idx + 1;
    }
    if debug {
        eprintln!("all: [{}]", show(&all_elements));
    }
    all_elements
}
/// Synthesizes the two collation elements for a code point that has no
/// explicit table entry.
///
/// The first element carries the lead weight `AAAA` (with `l2 = 0x0020`,
/// `l3 = 0x0002`), the second carries `BBBB` with all other levels zero:
///
/// * unified ideographs inside `4E00..=9FFF` or `F900..=FAFF`:
///   `AAAA = 0xFB40 + (cp >> 15)`, `BBBB = (cp & 0x7FFF) | 0x8000`;
/// * all other unified ideographs: same, with base `0xFB80`;
/// * code points inside a range listed in `IMPLICIT_WEIGHTS`:
///   `AAAA` is the range's base and `BBBB = (cp - start) | 0x8000`, except
///   that `18D00..=18D8F` is offset from `0x17000` instead of its own start;
/// * everything else: base `0xFBC0`.
///
/// NOTE(review): this mirrors the implicit-weight derivation of UTS #10
/// §10.1.3 — confirm the constants against the targeted UCA version.
fn implicit_weight(&self, cp: u32) -> Vec<Element> {
    let in_core_han_block =
        (0x4E00..=0x9FFF).contains(&cp) || (0xF900..=0xFAFF).contains(&cp);
    // Shared formula for the base-derived cases.
    let from_base = |base: u32| (base + (cp >> 15), (cp & 0x7FFF) | 0x8000);

    let (aaaa, bbbb) = if is_unified_ideograph(cp) {
        from_base(if in_core_han_block { 0xFB40 } else { 0xFB80 })
    } else if let Some((start, _, base)) = IMPLICIT_WEIGHTS
        .iter()
        .find(|(start, end, _)| cp >= *start && cp <= *end)
    {
        // 18D00..=18D8F is weighted relative to 0x17000 rather than to the
        // start of its own range.
        let origin = if (0x18D00..=0x18D8F).contains(&cp) {
            0x17000
        } else {
            *start
        };
        (*base, (cp - origin) | 0x8000)
    } else {
        from_base(0xFBC0)
    };

    vec![
        Element {
            l1: (aaaa & 0xFFFF) as u16,
            l2: 0x0020,
            l3: 0x0002,
            l4: 0x0000,
            variable: false,
        },
        Element {
            l1: (bbbb & 0xFFFF) as u16,
            l2: 0x0000,
            l3: 0x0000,
            l4: 0x0000,
            variable: false,
        },
    ]
}
fn sort_key_from_collation_elements(&self, collation_elements: &[Element]) -> Vec<u16> {
let l1 = collation_elements
.iter()
.map(|element| element.l1)
.filter(|x| *x > 0);
let l2 = collation_elements
.iter()
.map(|element| element.l2)
.filter(|x| *x > 0);
let l3 = collation_elements
.iter()
.map(|element| element.l3 as u16)
.filter(|x| *x > 0);
let l4 = collation_elements
.iter()
.map(|element| element.l4)
.filter(|x| *x > 0);
l1.chain(iter::once(0))
.chain(l2)
.chain(iter::once(0))
.chain(l3)
.chain(iter::once(0))
.chain(l4)
.collect()
}
/// Computes the sort key for `input`.
///
/// The text is decomposed with `u_norm::nfd`, mapped to collation elements,
/// and the elements are flattened level by level into `u16` weights
/// separated by `0`.
pub fn sort_key<S: AsRef<str>>(&self, input: S) -> Vec<u16> {
    let text = input.as_ref();
    let decomposed: Vec<_> = u_norm::nfd(text).collect();
    let elements = self.collation_elements(&decomposed);
    self.sort_key_from_collation_elements(&elements)
}
}
/// Returns `true` when `cp` falls inside one of the hard-coded ideograph
/// ranges below.
///
/// NOTE(review): the ranges look like the Unicode `Unified_Ideograph`
/// property of one specific Unicode version — confirm which one when
/// updating the data files.
pub fn is_unified_ideograph(cp: u32) -> bool {
    matches!(
        cp,
        0x3400..=0x4DBF
            | 0x4E00..=0x9FFF
            | 0xFA0E..=0xFA0F
            | 0xFA11
            | 0xFA13..=0xFA14
            | 0xFA1F
            | 0xFA21
            | 0xFA23..=0xFA24
            | 0xFA27..=0xFA29
            | 0x20000..=0x2A6DF
            | 0x2A700..=0x2B738
            | 0x2B740..=0x2B81D
            | 0x2B820..=0x2CEA1
            | 0x2CEB0..=0x2EBE0
            | 0x30000..=0x3134A
    )
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::fmt;

    /// Computes the sort key of `text` and renders it with `crate::fmt`.
    fn formatted_key(text: &str) -> String {
        let collator = Collator::default();
        fmt(&collator.sort_key(text))
    }

    /// Hex dump of the NFD form of `text`, used in assertion messages.
    fn nfd_hex(text: &str) -> String {
        u_norm::nfd(text)
            .map(|ch| format!("{:04X}", ch as u32))
            .collect::<Vec<_>>()
            .join(" ")
    }

    // Regression: two combining marks with no preceding starter.
    #[test]
    fn test_bug_001() {
        for fixture in ["\u{1abc}\u{334}", "\u{1ac1}\u{334}"] {
            similar_asserts::assert_eq!("| 004A 0033 | 0002 0002 |", formatted_key(fixture));
        }
    }

    // Regression: a base character followed by two combining marks.
    #[test]
    fn test_bug_002() {
        let fixture = "\u{a8}\u{301}\u{334}";
        similar_asserts::assert_eq!(
            "04D0 | 0020 004A 0024 | 0002 0002 0002 |",
            formatted_key(fixture),
            "nfd: {:?}",
            nfd_hex(fixture)
        );
    }

    // Regression: implicit weights for U+18D00 followed by `!`.
    #[test]
    fn test_bug_003() {
        let fixture = "\u{18d00}\u{21}";
        similar_asserts::assert_eq!(
            "FB00 9D00 0268 | 0020 0020 | 0002 0002 |",
            formatted_key(fixture),
            "nfd: {:?}",
            nfd_hex(fixture)
        );
    }
}

16
crates/u-sort/src/lib.rs Normal file
View File

@@ -0,0 +1,16 @@
pub mod collator;
mod table;
mod weights;
/// Renders a sort key as human-readable text.
///
/// Weights are separated by single spaces; a `0` weight (the level
/// separator) is shown as `|`, every other weight as 4-digit upper-case hex.
pub fn fmt(sort_key: &[u16]) -> String {
    let mut rendered = String::new();
    for (position, weight) in sort_key.iter().enumerate() {
        if position > 0 {
            rendered.push(' ');
        }
        if *weight == 0 {
            rendered.push('|');
        } else {
            rendered.push_str(&format!("{:04X}", weight));
        }
    }
    rendered
}

52
crates/u-sort/src/main.rs Normal file
View File

@@ -0,0 +1,52 @@
use std::cmp;
use std::collections::HashMap;
use std::fs;
use parse::uca::allkeys;
/// Prints statistics about `data/allkeys.txt`: the largest weight seen on
/// each level (with the number of bits needed to store it) and a histogram
/// of how many collation elements each entry has.
///
/// Panics if the data file cannot be read.
fn main() {
    let source = fs::read_to_string("data/allkeys.txt").unwrap();
    let parsed = allkeys::parse(&source);

    // Running maxima per weight level.
    let mut max_l1 = 0;
    let mut max_l2 = 0;
    let mut max_l3 = 0;
    let mut max_l4 = 0;
    // Number of entries keyed by their element count.
    let mut histogram = HashMap::new();

    for entry in parsed.entries {
        *histogram.entry(entry.elements.len()).or_insert(0) += 1;
        for element in entry.elements {
            max_l1 = cmp::max(max_l1, element.l1);
            max_l2 = cmp::max(max_l2, element.l2);
            max_l3 = cmp::max(max_l3, element.l3);
            max_l4 = cmp::max(max_l4, element.l4);
        }
    }

    /*
    l1 = 16 bits
    l2 = 9 bits
    l3 = 5 bits
    l4 = 0 bits
    variable = 1 bit
    total = 31 bits
    */
    println!("l1: {} - {} bit(s)", max_l1, u16::BITS - max_l1.leading_zeros());
    println!("l2: {} - {} bit(s)", max_l2, u16::BITS - max_l2.leading_zeros());
    println!("l3: {} - {} bit(s)", max_l3, u8::BITS - max_l3.leading_zeros());
    println!("l4: {} - {} bit(s)", max_l4, u16::BITS - max_l4.leading_zeros());
    println!("variable: 1 bit(s)");
    println!();
    println!("{:#?}", histogram);
}

161
crates/u-sort/src/table.rs Normal file
View File

@@ -0,0 +1,161 @@
use std::fmt::Display;
use u_fst::raw::{Fst, Output};
// Finite-state transducer embedded at compile time from `$OUT_DIR/table.fst`.
// `lookup` feeds it each `char` as the four native-endian bytes of its `u32`
// value and decodes the resulting `u64` output with `Value::from_u64`.
// NOTE(review): `new_unchecked` presumably skips validation of the embedded
// bytes, so the build step that writes `table.fst` must produce a well-formed
// FST with the same byte order — confirm against the build script.
const TABLE: Fst<&'static [u8]> =
Fst::new_unchecked(include_bytes!(concat!(env!("OUT_DIR"), "/table.fst")));
/// Finds the longest prefix of `value` that has an entry in the table.
///
/// Returns the collation elements of that prefix together with the index
/// (into `value`) of the last code point that is part of the match, or
/// `None` when no prefix matches (including when `value` is empty).
pub fn lookup(value: &[char]) -> Option<(Vec<Element>, usize)> {
    let mut node = TABLE.root();
    let mut out = Output::zero();
    // Raw output and position of the longest match seen so far.
    let mut longest = None;

    'scan: for (position, code_point) in value.iter().copied().enumerate() {
        // Keys are stored as the native-endian bytes of each code point.
        for byte in (code_point as u32).to_ne_bytes() {
            match node.find_input(byte) {
                Some(trans_index) => {
                    let transition = node.transition(trans_index);
                    node = TABLE.node(transition.addr);
                    out = out.cat(transition.out);
                    if node.is_final() {
                        longest = Some((out.cat(node.final_output()).value(), position));
                    }
                }
                // No transition: nothing longer can match.
                None => break 'scan,
            }
        }
    }

    let (raw, matched_index) = longest?;
    let elements = match Value::from_u64(raw) {
        // Single element packed directly into the FST output.
        Value::Entry(element) => vec![element],
        // Several elements: a slice of the shared weights table.
        Value::Index(first, count) => {
            let start = first as usize;
            let end = start + count as usize;
            crate::weights::EXPLICIT_WEIGHTS[start..end]
                .iter()
                .map(|&(l1, l2, l3, l4, variable)| Element {
                    l1,
                    l2,
                    l3,
                    l4,
                    variable,
                })
                .collect()
        }
    };
    Some((elements, matched_index))
}
/// One collation element: a weight per level plus a `variable` flag.
///
/// `Display` renders it as `.AAAA.BBBB.CCCC.DDDD`, with a leading `*`
/// instead of `.` when `variable` is set.
#[derive(Clone, Copy, PartialEq, Debug)]
pub struct Element {
/// Level-1 weight.
pub l1: u16,
/// Level-2 weight.
pub l2: u16,
/// Level-3 weight (stored in 8 bits).
pub l3: u8,
/// Level-4 weight.
pub l4: u16,
/// Marks the element as "variable".
/// NOTE(review): presumably the UCA variable-weighting flag from
/// `allkeys.txt`; sort key generation in the collator does not read it.
pub variable: bool,
}
impl Display for Element {
    /// Writes the element as `<marker>L1.L2.L3.L4`, each weight as 4-digit
    /// upper-case hex; the marker is `*` for variable elements, `.` otherwise.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let marker = if self.variable { "*" } else { "." };
        let Self { l1, l2, l3, l4, .. } = self;
        write!(
            f,
            "{}{:04X}.{:04X}.{:04X}.{:04X}",
            marker, l1, l2, l3, l4
        )
    }
}
/// Decoded form of the `u64` output stored in the FST.
#[derive(Clone, Copy, PartialEq, Debug)]
enum Value {
/// A single collation element packed directly into the output.
Entry(Element),
/// Start offset and length of a run of elements in
/// `crate::weights::EXPLICIT_WEIGHTS`.
Index(u32, u8),
}
impl Value {
    /// Packs the value into a `u64`.
    ///
    /// Bit 0 is the tag: `0` for `Entry`, `1` for `Index`.
    ///
    /// * `Entry`: bit 1 = `variable`, bits 2..18 = `l1`, bits 18..34 = `l2`,
    ///   bits 34..42 = `l3`, bits 42..58 = `l4`.
    /// * `Index`: bits 1..9 = length, bits 9..41 = start index.
    fn to_u64(self) -> u64 {
        match self {
            Self::Entry(Element {
                l1,
                l2,
                l3,
                l4,
                variable,
            }) => {
                (u64::from(variable) << 1)
                    | (u64::from(l1) << 2)
                    | (u64::from(l2) << 18)
                    | (u64::from(l3) << 34)
                    | (u64::from(l4) << 42)
            }
            Self::Index(idx, len) => 1 | (u64::from(len) << 1) | (u64::from(idx) << 9),
        }
    }

    /// Inverse of [`Value::to_u64`]; bits outside the documented layout are
    /// ignored.
    fn from_u64(data: u64) -> Self {
        let tagged_as_index = (data & 1) != 0;
        if tagged_as_index {
            // The narrowing casts keep exactly the low 8 / 32 bits.
            let len = (data >> 1) as u8;
            let idx = (data >> 9) as u32;
            return Self::Index(idx, len);
        }
        // The narrowing casts keep exactly the low bits of each field.
        Self::Entry(Element {
            l1: (data >> 2) as u16,
            l2: (data >> 18) as u16,
            l3: (data >> 34) as u8,
            l4: (data >> 42) as u16,
            variable: (data >> 1) & 1 == 1,
        })
    }
}
#[cfg(test)]
mod tests {
use proptest::prelude::*;
use super::*;
// Strategy producing arbitrary `Value`s of both variants, with every field
// drawn from its full integer (or bool) range.
fn value_strategy() -> impl Strategy<Value = Value> {
prop_oneof![
(any::<u32>(), any::<u8>()).prop_map(|(idx, len)| Value::Index(idx, len)),
(
any::<u16>(),
any::<u16>(),
any::<u8>(),
any::<u16>(),
any::<bool>()
)
.prop_map(|(l1, l2, l3, l4, variable)| Value::Entry(Element {
l1,
l2,
l3,
l4,
variable
})),
]
}
proptest! {
// Packing a value with `to_u64` and decoding it with `from_u64` must give
// back the original value.
#[test]
fn proptest_serialize_and_deserialize(a in value_strategy()) {
let data = a.to_u64();
let b = Value::from_u64(data);
prop_assert_eq!(a, b);
}
}
}

View File

@@ -0,0 +1,2 @@
// Weight tables generated into `$OUT_DIR` at build time and spliced in here.
// The rest of the crate reads `EXPLICIT_WEIGHTS` and `IMPLICIT_WEIGHTS` from
// this module; presumably each included file defines the table of the same
// name — confirm against the build script.
include!(concat!(env!("OUT_DIR"), "/explicit_weights.rs"));
include!(concat!(env!("OUT_DIR"), "/implicit_weights.rs"));