diff --git a/crates/u-norm/src/buffer.rs b/crates/u-norm/src/buffer.rs new file mode 100644 index 0000000..8cd55f2 --- /dev/null +++ b/crates/u-norm/src/buffer.rs @@ -0,0 +1,84 @@ +use std::ops::Range; + +use tinyvec::TinyVec; + +use crate::table; + +pub(crate) struct Buffer { + buffer: TinyVec<[(u8, char); 4]>, + ready: Range, +} + +impl Buffer { + pub(crate) fn new() -> Self { + Self { + buffer: TinyVec::new(), + ready: 0..0, + } + } + + #[inline(always)] + pub(crate) fn is_ready(&self) -> bool { + self.ready.end != 0 + } + + #[inline(always)] + pub(crate) fn is_empty(&self) -> bool { + self.buffer.is_empty() + } + + #[inline(always)] + pub(crate) fn finish(&mut self) { + self.sort_pending(); + self.ready.end = self.buffer.len(); + } + + #[inline(always)] + pub(crate) fn push(&mut self, ch: char, class: u8) { + if class == 0 { + self.sort_pending(); + + self.buffer.push((class, ch)); + self.ready.end = self.buffer.len(); + } else { + self.buffer.push((class, ch)); + } + } + + #[inline(always)] + pub(crate) fn push_back(&mut self, ch: char) { + self.push(ch, table::lookup(ch).combining_class()); + } + + #[inline(always)] + pub(crate) fn pop(&mut self) -> char { + let (_, ch) = self.buffer[self.ready.start]; + + let next = self.ready.start + 1; + + if next == self.ready.end { + self.reset(); + } else { + self.ready.start = next; + } + + ch + } + + #[inline(always)] + fn sort_pending(&mut self) { + self.buffer[self.ready.end..].sort_by_key(|k| k.0); + } + + #[inline(always)] + fn reset(&mut self) { + let pending = self.buffer.len() - self.ready.end; + + for i in 0..pending { + self.buffer[i] = self.buffer[i + self.ready.end]; + } + + self.buffer.truncate(pending); + self.ready = 0..0; + } +} diff --git a/crates/u-norm/src/lib.rs b/crates/u-norm/src/lib.rs index d9ea790..1b8b756 100644 --- a/crates/u-norm/src/lib.rs +++ b/crates/u-norm/src/lib.rs @@ -1,11 +1,11 @@ use std::iter::Fuse; -use std::ops::Range; use std::str::Chars; -use tinyvec::TinyVec; - +mod buffer; 
pub mod table; +use buffer::Buffer; + pub fn nfd(s: &str) -> Decompositions> { Decompositions { iter: s.chars().fuse(), @@ -13,65 +13,6 @@ pub fn nfd(s: &str) -> Decompositions> { } } -struct Buffer { - buffer: TinyVec<[(u8, char); 4]>, - ready: Range, -} - -impl Buffer { - fn new() -> Self { - Self { - buffer: TinyVec::new(), - ready: 0..0, - } - } - - #[inline(always)] - fn push(&mut self, ch: char, class: u8) { - if class == 0 { - self.sort_pending(); - - self.buffer.push((class, ch)); - self.ready.end = self.buffer.len(); - } else { - self.buffer.push((class, ch)); - } - } - - #[inline(always)] - fn push_back(&mut self, ch: char) { - self.push(ch, table::lookup(ch).combining_class()); - } - - #[inline(always)] - fn sort_pending(&mut self) { - self.buffer[self.ready.end..].sort_by_key(|k| k.0); - } - - #[inline(always)] - fn reset(&mut self) { - let pending = self.buffer.len() - self.ready.end; - - for i in 0..pending { - self.buffer[i] = self.buffer[i + self.ready.end]; - } - - self.buffer.truncate(pending); - self.ready = 0..0; - } - - #[inline(always)] - fn increment_next_ready(&mut self) { - let next = self.ready.start + 1; - - if next == self.ready.end { - self.reset(); - } else { - self.ready.start = next; - } - } -} - pub struct Decompositions { iter: Fuse, buffer: Buffer, @@ -82,17 +23,16 @@ impl> Iterator for Decompositions { #[inline(always)] fn next(&mut self) -> Option { - while self.buffer.ready.end == 0 { + while !self.buffer.is_ready() { match self.iter.next() { Some(ch) => { decompose(ch, &mut self.buffer); } None => { - if self.buffer.buffer.is_empty() { + if self.buffer.is_empty() { return None; } else { - self.buffer.sort_pending(); - self.buffer.ready.end = self.buffer.buffer.len(); + self.buffer.finish(); break; } @@ -100,11 +40,7 @@ impl> Iterator for Decompositions { } } - let (_, ch) = self.buffer.buffer[self.buffer.ready.start]; - - self.buffer.increment_next_ready(); - - Some(ch) + Some(self.buffer.pop()) } fn size_hint(&self) -> 
(usize, Option) { diff --git a/crates/u-sort/build.rs b/crates/u-sort/build.rs index 44b3c35..9e58699 100644 --- a/crates/u-sort/build.rs +++ b/crates/u-sort/build.rs @@ -5,6 +5,11 @@ use std::path::Path; use parse::uca::allkeys; use u_fst::raw::Builder; +#[path = "src/element.rs"] +mod element; + +use element::{Element, Range}; + fn main() { println!("cargo:rerun-if-changed=data/allkeys.txt"); println!("cargo:rerun-if-changed=build.rs"); @@ -55,26 +60,35 @@ fn main() { let mut builder = Builder::memory(); let mut overflow = Vec::new(); - for (chars, mut elements) in entries.into_iter() { - let value = match elements.len() { - 1 => { - let element = elements.pop().unwrap(); + for (chars, elements) in entries.into_iter() { + let value = if elements.len() <= 2 { + let chunks = elements + .iter() + .map(|e| Element::new(e.l1, e.l2, e.l3, e.l4, e.variable)) + .collect::>(); - ((element.l4 as u64) << 42) - | ((element.l3 as u64) << 34) - | ((element.l2 as u64) << 18) - | ((element.l1 as u64) << 2) - | (if element.variable { 1 } else { 0 } << 1) - } - 2.. 
=> { - let idx = overflow.len(); - let len = elements.len(); + element::pack(&chunks) + } else { + let elements = elements + .chunks(2) + .map(|chunks| { + let chunks = chunks + .iter() + .map(|e| Element::new(e.l1, e.l2, e.l3, e.l4, e.variable)) + .collect::>(); - overflow.extend(elements.into_iter()); + element::pack(&chunks) + }) + .collect::>(); - ((idx as u64) << 9) | ((len as u64) << 1) | 1 - } - _ => panic!("this shouldn't happen!"), + let start = overflow.len() as u32; + let len = elements.len() as u8; + + overflow.extend(elements.into_iter()); + + let data = Range::new(start, len).as_u64(); + + data << 1 }; builder.insert(chars, value).unwrap(); @@ -87,15 +101,12 @@ fn main() { let mut explicit_weights = String::new(); explicit_weights.push_str(&format!( - "pub const EXPLICIT_WEIGHTS: [(u16, u16, u8, u16, bool); {}] = [\n", + "pub const EXPLICIT_WEIGHTS: [u64; {}] = [\n", overflow.len(), )); - for element in overflow { - explicit_weights.push_str(&format!( - " ({}, {}, {}, {}, {}),\n", - element.l1, element.l2, element.l3, element.l4, element.variable, - )); + for elements in overflow { + explicit_weights.push_str(&format!(" {},\n", elements)); } explicit_weights.push_str("];\n"); diff --git a/crates/u-sort/src/collator.rs b/crates/u-sort/src/collator.rs index 7d174c4..a9fe9d5 100644 --- a/crates/u-sort/src/collator.rs +++ b/crates/u-sort/src/collator.rs @@ -1,6 +1,7 @@ use std::iter; -use crate::table::{self, Element}; +use crate::element::Element; +use crate::table; use crate::weights::IMPLICIT_WEIGHTS; #[derive(Default)] @@ -8,7 +9,7 @@ pub struct Collator; impl Collator { fn collation_elements(&self, normalized: &[char]) -> Vec { - let debug = false; + let debug = true; let mut all_elements = Vec::new(); @@ -28,7 +29,8 @@ impl Collator { } while cp_index < code_points_len { - let (mut elements, idx) = table::lookup(&code_points[cp_index..]).unwrap_or_default(); + let (elements, idx) = table::lookup(&code_points[cp_index..]).unwrap_or_default(); + let 
mut elements = elements.collect::>(); if debug { eprintln!( @@ -112,6 +114,7 @@ impl Collator { } let (new_elements, new_idx) = table::lookup(&new_key).expect("lookup"); + let new_elements = new_elements.collect::>(); if debug { eprintln!( @@ -235,42 +238,36 @@ impl Collator { } vec![ - Element { - l1: aaaa.map(|x| (x & 0xFFFF) as u16).unwrap_or(0), - l2: 0x0020, - l3: 0x0002, - l4: 0x0000, - variable: false, - }, - Element { - l1: (bbbb & 0xFFFF) as u16, - l2: 0x0000, - l3: 0x0000, - l4: 0x0000, - variable: false, - }, + Element::new( + aaaa.map(|x| (x & 0xFFFF) as u16).unwrap_or(0), + 0x0020, + 0x0002, + 0x0000, + false, + ), + Element::new((bbbb & 0xFFFF) as u16, 0x0000, 0x0000, 0x0000, false), ] } fn sort_key_from_collation_elements(&self, collation_elements: &[Element]) -> Vec { let l1 = collation_elements .iter() - .map(|element| element.l1) + .map(|element| element.l1()) .filter(|x| *x > 0); let l2 = collation_elements .iter() - .map(|element| element.l2) + .map(|element| element.l2()) .filter(|x| *x > 0); let l3 = collation_elements .iter() - .map(|element| element.l3 as u16) + .map(|element| element.l3() as u16) .filter(|x| *x > 0); let l4 = collation_elements .iter() - .map(|element| element.l4) + .map(|element| element.l4()) .filter(|x| *x > 0); l1.chain(iter::once(0)) @@ -371,4 +368,24 @@ mod tests { .join(" ") ); } + + #[test] + fn test_bug_004() { + // 31B3 0062; # (ㆳ) BOPOMOFO LETTER INNN [4593 208F | 0020 0119 0020 | 0016 0016 0002 |] + + let collator = Collator::default(); + + let fixture = "\u{31b3}\u{62}"; + let sort_key = collator.sort_key(fixture); + + similar_asserts::assert_eq!( + "4593 208F | 0020 0119 0020 | 0016 0016 0002 |", + fmt(&sort_key), + "nfd: {:?}", + u_norm::nfd(fixture) + .map(|ch| format!("{:04X}", ch as u32)) + .collect::>() + .join(" ") + ); + } } diff --git a/crates/u-sort/src/element.rs b/crates/u-sort/src/element.rs new file mode 100644 index 0000000..80c831e --- /dev/null +++ b/crates/u-sort/src/element.rs @@ -0,0 +1,189 
use std::fmt::Display;

/// Width in bits of the level-1 weight field.
const L1_BITS: u32 = 16;
/// Width in bits of the level-2 weight field.
const L2_BITS: u32 = 9;
/// Width in bits of the level-3 weight field.
const L3_BITS: u32 = 5;
/// Width in bits of the `variable` flag.
const VARIABLE_BITS: u32 = 1;
/// Total payload width of an [`Element`] (31 bits); the remaining bit of each
/// 32-bit slot is the "occupied" tag written by [`pack`].
const ELEMENT_BITS: u32 = VARIABLE_BITS + L1_BITS + L2_BITS + L3_BITS;

/// Returns a mask with the low `bits` bits set. `bits` must be below 32.
const fn mask(bits: u32) -> u32 {
    (1 << bits) - 1
}

/// Iterator over packed [`Element`]s.
///
/// `cur` is the word currently being unpacked (up to two elements, low half
/// first); `buf` holds the words still to be loaded once `cur` runs dry.
#[derive(Clone, Copy, Default)]
pub(crate) struct Elements {
    pub(crate) buf: &'static [u64],
    pub(crate) cur: u64,
}

impl Elements {
    /// Returns `true` when no element is left to yield.
    pub(crate) fn is_empty(&self) -> bool {
        self.cur == 0 && self.buf.is_empty()
    }
}

impl Iterator for Elements {
    type Item = Element;

    #[inline(always)]
    fn next(&mut self) -> Option<Element> {
        if self.cur == 0 {
            // Current word exhausted: load the next one, or stop.
            let (&word, rest) = self.buf.split_first()?;

            self.cur = word;
            self.buf = rest;
        }

        // Bit 0 of the slot is the tag; the element payload sits above it.
        let element = Element(((self.cur >> 1) as u32) & mask(ELEMENT_BITS));

        self.cur >>= 32;

        Some(element)
    }

    /// Exact for the current word; every buffered word yields one or two
    /// elements, which gives the bounds for the rest.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let in_cur = match self.cur {
            0 => 0,
            word if word >> 32 == 0 => 1,
            _ => 2,
        };
        let words = self.buf.len();

        (in_cur + words, Some(in_cur + 2 * words))
    }
}

// Once `cur` is 0 and `buf` is empty, `next` keeps returning `None`.
impl std::iter::FusedIterator for Elements {}

/// A collation element packed into 31 bits of a `u32`.
///
/// Layout, from the least significant bit: `variable` (1 bit), `l1`
/// (16 bits), `l2` (9 bits), `l3` (5 bits). `l4` is not stored.
#[derive(Clone, Copy, PartialEq, Debug)]
pub(crate) struct Element(u32);

impl Element {
    /// Packs the given weights into an element.
    ///
    /// # Panics
    ///
    /// In builds with debug assertions enabled, panics if `l1`, `l2` or `l3`
    /// does not fit its field, or if `l4` is non-zero. Without debug
    /// assertions these conditions are not checked.
    pub(crate) fn new(l1: u16, l2: u16, l3: u8, l4: u16, variable: bool) -> Self {
        debug_assert!(u16::BITS - l1.leading_zeros() <= L1_BITS);
        debug_assert!(u16::BITS - l2.leading_zeros() <= L2_BITS);
        debug_assert!(u8::BITS - l3.leading_zeros() <= L3_BITS);
        debug_assert!(l4 == 0);

        Self(
            (variable as u32)
                | ((l1 as u32) << VARIABLE_BITS)
                | ((l2 as u32) << (VARIABLE_BITS + L1_BITS))
                | ((l3 as u32) << (VARIABLE_BITS + L1_BITS + L2_BITS)),
        )
    }

    /// Level-1 weight.
    #[inline(always)]
    pub(crate) fn l1(&self) -> u16 {
        ((self.0 >> VARIABLE_BITS) & mask(L1_BITS)) as _
    }

    /// Level-2 weight.
    #[inline(always)]
    pub(crate) fn l2(&self) -> u16 {
        ((self.0 >> (VARIABLE_BITS + L1_BITS)) & mask(L2_BITS)) as _
    }

    /// Level-3 weight.
    #[inline(always)]
    pub(crate) fn l3(&self) -> u8 {
        ((self.0 >> (VARIABLE_BITS + L1_BITS + L2_BITS)) & mask(L3_BITS)) as _
    }

    /// Level-4 weight; not stored, so always 0.
    #[inline(always)]
    pub(crate) fn l4(&self) -> u16 {
        0
    }

    /// Whether the `variable` flag is set.
    #[inline(always)]
    fn is_variable(&self) -> bool {
        (self.0 & mask(VARIABLE_BITS)) == 1
    }

    /// Raw packed representation.
    pub(crate) fn as_u32(&self) -> u32 {
        self.0
    }
}

impl Display for Element {
    /// Formats as `*L1.L2.L3.L4` for variable elements and `.L1.L2.L3.L4`
    /// otherwise, each weight as four upper-case hex digits.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
            "{}{:04X}.{:04X}.{:04X}.{:04X}",
            if self.is_variable() { "*" } else { "." },
            self.l1(),
            self.l2(),
            self.l3(),
            self.l4()
        )
    }
}

/// A `(start, len)` pair packed into a `u64`: `len` in the low 8 bits and
/// `start` in the 32 bits above it.
#[derive(Clone, Copy, PartialEq, Debug)]
pub(crate) struct Range(pub(crate) u64);

impl Range {
    /// Packs `start` and `len`.
    pub(crate) fn new(start: u32, len: u8) -> Self {
        Self((len as u64) | ((start as u64) << 8))
    }

    /// First index of the range.
    #[inline(always)]
    pub(crate) fn start(&self) -> usize {
        ((self.0 >> 8) & 0xFFFFFFFF) as usize
    }

    /// One past the last index of the range (`start + len`).
    #[inline(always)]
    pub(crate) fn end(&self) -> usize {
        self.start() + (self.0 & 0xFF) as usize
    }

    /// Raw packed representation.
    pub(crate) fn as_u64(&self) -> u64 {
        self.0
    }
}

/// Packs up to two elements into one `u64`, first element in the low half.
///
/// Each occupied 32-bit half has its lowest bit set as a tag, so a packed
/// element is never 0 and [`Elements`] can treat a zero word as exhausted.
/// An empty slice packs to 0.
///
/// # Panics
///
/// Panics if `elements` has more than two entries.
pub(crate) fn pack(elements: &[Element]) -> u64 {
    assert!(elements.len() <= 2);

    let mut data = 0;

    // Walk backwards so the first element ends up in the low half.
    for e in elements.iter().rev() {
        if data != 0 {
            data <<= 32;
        }

        data |= ((e.as_u32() as u64) << 1) | 1;
    }

    data
}
{ + #[test] + fn proptest_element(l1 in any::(), l2 in (0u16..256), l3 in (0u8..32), variable in any::()) { + let r = Element::new(l1, l2, l3, 0, variable); + + prop_assert_eq!(r.l1(), l1); + prop_assert_eq!(r.l2(), l2); + prop_assert_eq!(r.l3(), l3); + prop_assert_eq!(r.l4(), 0); + prop_assert_eq!(r.is_variable(), variable); + } + + #[test] + fn proptest_range(start in any::(), len in any::()) { + let r = Range::new(start, len); + + prop_assert_eq!(r.start(), start as usize); + prop_assert_eq!(r.end(), start as usize + len as usize); + } + } +} diff --git a/crates/u-sort/src/lib.rs b/crates/u-sort/src/lib.rs index 6d9e79b..d929dfc 100644 --- a/crates/u-sort/src/lib.rs +++ b/crates/u-sort/src/lib.rs @@ -1,4 +1,5 @@ pub mod collator; +mod element; mod table; mod weights; diff --git a/crates/u-sort/src/table.rs b/crates/u-sort/src/table.rs index 53d69e4..e3b7636 100644 --- a/crates/u-sort/src/table.rs +++ b/crates/u-sort/src/table.rs @@ -1,11 +1,12 @@ -use std::fmt::Display; - use u_fst::raw::{Fst, Output}; +use crate::element::{Elements, Range}; +use crate::weights::EXPLICIT_WEIGHTS; + const TABLE: Fst<&'static [u8]> = Fst::new_unchecked(include_bytes!(concat!(env!("OUT_DIR"), "/table.fst"))); -pub fn lookup(value: &[char]) -> Option<(Vec, usize)> { +pub(crate) fn lookup(value: &[char]) -> Option<(Elements, usize)> { let mut node = TABLE.root(); let mut out = Output::zero(); @@ -30,132 +31,19 @@ pub fn lookup(value: &[char]) -> Option<(Vec, usize)> { last_match.map(|(data, idx)| { ( - match Value::from_u64(data) { - Value::Entry(element) => vec![element], - Value::Index(idx, len) => { - let start = idx as usize; - let end = start + len as usize; - - crate::weights::EXPLICIT_WEIGHTS[start..end] - .iter() - .map(|(l1, l2, l3, l4, variable)| Element { - l1: *l1, - l2: *l2, - l3: *l3, - l4: *l4, - variable: *variable, - }) - .collect() + if data & 0x1 == 1 { + Elements { + buf: &[], + cur: data, } + } else { + let r = Range(data >> 1); + + let (first, buf) = 
EXPLICIT_WEIGHTS[r.start()..r.end()].split_at(1); + + Elements { buf, cur: first[0] } }, idx, ) }) } - -#[derive(Clone, Copy, PartialEq, Debug)] -pub struct Element { - pub l1: u16, - pub l2: u16, - pub l3: u8, - pub l4: u16, - pub variable: bool, -} - -impl Display for Element { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "{}{:04X}.{:04X}.{:04X}.{:04X}", - if self.variable { "*" } else { "." }, - self.l1, - self.l2, - self.l3, - self.l4 - )?; - - Ok(()) - } -} - -#[derive(Clone, Copy, PartialEq, Debug)] -enum Value { - Entry(Element), - Index(u32, u8), -} - -impl Value { - fn to_u64(self) -> u64 { - match self { - Self::Entry(element) => { - ((element.l4 as u64) << 42) - | ((element.l3 as u64) << 34) - | ((element.l2 as u64) << 18) - | ((element.l1 as u64) << 2) - | (if element.variable { 1 } else { 0 } << 1) - } - Self::Index(idx, len) => ((idx as u64) << 9) | ((len as u64) << 1) | 1, - } - } - - fn from_u64(data: u64) -> Self { - if (data & 1) == 0 { - let variable = ((data >> 1) & 1) == 1; - - let l1 = ((data >> 2) & 0xFFFF) as u16; - let l2 = ((data >> 18) & 0xFFFF) as u16; - let l3 = ((data >> 34) & 0xFF) as u8; - let l4 = ((data >> 42) & 0xFFFF) as u16; - - Self::Entry(Element { - l1, - l2, - l3, - l4, - variable, - }) - } else { - let len = ((data >> 1) & 0xFF) as u8; - let idx = ((data >> 9) & 0xFFFFFFFF) as u32; - - Self::Index(idx, len) - } - } -} - -#[cfg(test)] -mod tests { - use proptest::prelude::*; - - use super::*; - - fn value_strategy() -> impl Strategy { - prop_oneof![ - (any::(), any::()).prop_map(|(idx, len)| Value::Index(idx, len)), - ( - any::(), - any::(), - any::(), - any::(), - any::() - ) - .prop_map(|(l1, l2, l3, l4, variable)| Value::Entry(Element { - l1, - l2, - l3, - l4, - variable - })), - ] - } - - proptest! { - #[test] - fn proptest_serialize_and_deserialize(a in value_strategy()) { - let data = a.to_u64(); - let b = Value::from_u64(data); - - prop_assert_eq!(a, b); - } - } -}