Better packing for u-sort

This commit is contained in:
2022-05-27 07:29:28 +02:00
parent cb74158d5d
commit e9935c8b54
7 changed files with 367 additions and 241 deletions

View File

@@ -1,6 +1,7 @@
use std::iter;
use crate::table::{self, Element};
use crate::element::Element;
use crate::table;
use crate::weights::IMPLICIT_WEIGHTS;
#[derive(Default)]
@@ -8,7 +9,7 @@ pub struct Collator;
impl Collator {
fn collation_elements(&self, normalized: &[char]) -> Vec<Element> {
let debug = false;
let debug = true;
let mut all_elements = Vec::new();
@@ -28,7 +29,8 @@ impl Collator {
}
while cp_index < code_points_len {
let (mut elements, idx) = table::lookup(&code_points[cp_index..]).unwrap_or_default();
let (elements, idx) = table::lookup(&code_points[cp_index..]).unwrap_or_default();
let mut elements = elements.collect::<Vec<_>>();
if debug {
eprintln!(
@@ -112,6 +114,7 @@ impl Collator {
}
let (new_elements, new_idx) = table::lookup(&new_key).expect("lookup");
let new_elements = new_elements.collect::<Vec<_>>();
if debug {
eprintln!(
@@ -235,42 +238,36 @@ impl Collator {
}
vec![
Element {
l1: aaaa.map(|x| (x & 0xFFFF) as u16).unwrap_or(0),
l2: 0x0020,
l3: 0x0002,
l4: 0x0000,
variable: false,
},
Element {
l1: (bbbb & 0xFFFF) as u16,
l2: 0x0000,
l3: 0x0000,
l4: 0x0000,
variable: false,
},
Element::new(
aaaa.map(|x| (x & 0xFFFF) as u16).unwrap_or(0),
0x0020,
0x0002,
0x0000,
false,
),
Element::new((bbbb & 0xFFFF) as u16, 0x0000, 0x0000, 0x0000, false),
]
}
fn sort_key_from_collation_elements(&self, collation_elements: &[Element]) -> Vec<u16> {
let l1 = collation_elements
.iter()
.map(|element| element.l1)
.map(|element| element.l1())
.filter(|x| *x > 0);
let l2 = collation_elements
.iter()
.map(|element| element.l2)
.map(|element| element.l2())
.filter(|x| *x > 0);
let l3 = collation_elements
.iter()
.map(|element| element.l3 as u16)
.map(|element| element.l3() as u16)
.filter(|x| *x > 0);
let l4 = collation_elements
.iter()
.map(|element| element.l4)
.map(|element| element.l4())
.filter(|x| *x > 0);
l1.chain(iter::once(0))
@@ -371,4 +368,24 @@ mod tests {
.join(" ")
);
}
/// Regression test: the sort key of U+31B3 followed by U+0062 must match the
/// weights listed in the data line quoted inside the test.
#[test]
fn test_bug_004() {
    // 31B3 0062; # (ㆳ) BOPOMOFO LETTER INNN [4593 208F | 0020 0119 0020 | 0016 0016 0002 |]
    let input = "\u{31b3}\u{62}";
    let expected = "4593 208F | 0020 0119 0020 | 0016 0016 0002 |";

    let collator = Collator::default();
    let key = collator.sort_key(input);

    // The failure message shows the NFD code points of the input so that a
    // normalization problem can be told apart from a weight lookup problem.
    similar_asserts::assert_eq!(
        expected,
        fmt(&key),
        "nfd: {:?}",
        u_norm::nfd(input)
            .map(|ch| format!("{:04X}", ch as u32))
            .collect::<Vec<_>>()
            .join(" ")
    );
}
}

View File

@@ -0,0 +1,189 @@
use std::fmt::Display;
// Bit widths of the fields that `Element::new` packs into a single `u32`.
// Together they add up to 31 bits.
/// Width in bits of the `l1` field.
const L1_BITS: u32 = 16;
/// Width in bits of the `l2` field.
const L2_BITS: u32 = 9;
/// Width in bits of the `l3` field.
const L3_BITS: u32 = 5;
/// Width in bits of the variable flag, which sits below `l1` at the low end.
const VARIABLE_BITS: u32 = 1;
/// Iterator over collation elements stored as packed 64-bit words.
///
/// Each word holds up to two entries, one per 32-bit half, low half first.
/// Within a half, bit 0 is a presence marker and the 31 bits above it are the
/// raw `Element` value (the layout produced by `pack`). A word equal to `0`
/// therefore contains no entry at all.
#[derive(Clone, Copy, Default)]
pub(crate) struct Elements {
    /// Words that have not been loaded into `cur` yet.
    pub(crate) buf: &'static [u64],
    /// Word currently being drained; `0` means nothing is left in it.
    pub(crate) cur: u64,
}

impl Elements {
    /// Returns `true` when iterating would yield no element, i.e. the current
    /// word is drained and every remaining word is zero.
    pub(crate) fn is_empty(&self) -> bool {
        self.cur == 0 && self.buf.iter().all(|&word| word == 0)
    }
}

impl Iterator for Elements {
    type Item = Element;

    /// Yields the entry in the low half of the current word, refilling from
    /// `buf` when the current word is drained.
    #[inline(always)]
    fn next(&mut self) -> Option<Self::Item> {
        // Refill until a non-zero word is found. A zero word has no presence
        // bit set, so decoding it would fabricate an all-zero element.
        while self.cur == 0 {
            let (&word, rest) = self.buf.split_first()?;
            self.cur = word;
            self.buf = rest;
        }

        // Drop the presence marker (bit 0) and keep the 31 payload bits.
        let element = Element(((self.cur >> 1) & 0x7FFF_FFFF) as u32);
        // Move the second entry, if any, into the low half.
        self.cur >>= 32;
        Some(element)
    }
}
/// A collation element packed into the low 31 bits of a `u32`.
///
/// Layout, least significant bit first:
///
/// | field    | width           |
/// |----------|-----------------|
/// | variable | `VARIABLE_BITS` |
/// | l1       | `L1_BITS`       |
/// | l2       | `L2_BITS`       |
/// | l3       | `L3_BITS`       |
///
/// No bits are reserved for `l4`; it always reads back as `0`.
#[derive(Clone, Copy, PartialEq, Debug)]
pub(crate) struct Element(u32);

impl Element {
    /// Bit offset of the `l1` field.
    const L1_SHIFT: u32 = VARIABLE_BITS;
    /// Bit offset of the `l2` field.
    const L2_SHIFT: u32 = Self::L1_SHIFT + L1_BITS;
    /// Bit offset of the `l3` field.
    const L3_SHIFT: u32 = Self::L2_SHIFT + L2_BITS;

    /// Mask selecting `L1_BITS` low bits.
    const L1_MASK: u32 = (1 << L1_BITS) - 1;
    /// Mask selecting `L2_BITS` low bits.
    const L2_MASK: u32 = (1 << L2_BITS) - 1;
    /// Mask selecting `L3_BITS` low bits.
    const L3_MASK: u32 = (1 << L3_BITS) - 1;

    /// Packs the given fields into one `Element`.
    ///
    /// Debug builds assert that `l1`, `l2` and `l3` each fit their bit budget
    /// and that `l4` is zero. `l4` itself is never stored.
    pub(crate) fn new(l1: u16, l2: u16, l3: u8, l4: u16, variable: bool) -> Self {
        debug_assert!(u16::BITS - l1.leading_zeros() <= L1_BITS);
        debug_assert!(u16::BITS - l2.leading_zeros() <= L2_BITS);
        debug_assert!(u8::BITS - l3.leading_zeros() <= L3_BITS);
        debug_assert!(l4 == 0);

        let flag = u32::from(variable);
        let l1 = u32::from(l1) << Self::L1_SHIFT;
        let l2 = u32::from(l2) << Self::L2_SHIFT;
        let l3 = u32::from(l3) << Self::L3_SHIFT;
        Self(flag | l1 | l2 | l3)
    }

    /// Returns the `l1` field.
    #[inline(always)]
    pub(crate) fn l1(&self) -> u16 {
        ((self.0 >> Self::L1_SHIFT) & Self::L1_MASK) as u16
    }

    /// Returns the `l2` field.
    #[inline(always)]
    pub(crate) fn l2(&self) -> u16 {
        ((self.0 >> Self::L2_SHIFT) & Self::L2_MASK) as u16
    }

    /// Returns the `l3` field.
    #[inline(always)]
    pub(crate) fn l3(&self) -> u8 {
        ((self.0 >> Self::L3_SHIFT) & Self::L3_MASK) as u8
    }

    /// Returns the `l4` field, which is not stored and is therefore always `0`.
    #[inline(always)]
    pub(crate) fn l4(&self) -> u16 {
        0
    }

    /// Returns whether the variable flag (bit 0) is set.
    #[inline(always)]
    fn is_variable(&self) -> bool {
        self.0 & 1 != 0
    }

    /// Returns the raw packed representation.
    pub(crate) fn as_u32(&self) -> u32 {
        self.0
    }
}
/// Formats an element as a marker followed by the four levels in four-digit
/// uppercase hex, separated by dots. The marker is `*` when the variable flag
/// is set and `.` otherwise.
impl Display for Element {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let marker = if self.is_variable() { "*" } else { "." };
        let (l1, l2, l3, l4) = (self.l1(), self.l2(), self.l3(), self.l4());
        write!(f, "{}{:04X}.{:04X}.{:04X}.{:04X}", marker, l1, l2, l3, l4)
    }
}
/// A `(start, len)` pair packed into a `u64`: `len` occupies bits 0..8 and
/// `start` occupies bits 8..40.
#[derive(Clone, Copy, PartialEq, Debug)]
pub(crate) struct Range(pub(crate) u64);

impl Range {
    /// Packs `start` and `len` into a new `Range`.
    pub(crate) fn new(start: u32, len: u8) -> Self {
        let packed_start = u64::from(start) << 8;
        Self(packed_start | u64::from(len))
    }

    /// Returns the start index (the 32 bits above the length byte).
    #[inline(always)]
    pub(crate) fn start(&self) -> usize {
        let start = (self.0 >> 8) as u32;
        start as usize
    }

    /// Returns the exclusive end index, `start + len`.
    #[inline(always)]
    pub(crate) fn end(&self) -> usize {
        // The low byte is the length.
        let len = usize::from(self.0 as u8);
        self.start() + len
    }

    /// Returns the raw packed representation.
    pub(crate) fn as_u64(&self) -> u64 {
        self.0
    }
}
/// Packs at most two elements into one `u64`.
///
/// Each element takes one 32-bit half: the raw element value shifted left by
/// one with bit 0 set as a presence marker. The first element ends up in the
/// low half. An empty slice packs to `0`.
///
/// # Panics
///
/// Panics if `elements` holds more than two items.
pub(crate) fn pack(elements: &[Element]) -> u64 {
    assert!(elements.len() <= 2);
    // Walk back to front so that earlier elements land in lower bits.
    elements
        .iter()
        .rev()
        .fold(0u64, |word, e| (word << 32) | (u64::from(e.as_u32()) << 1) | 1)
}
#[cfg(test)]
mod tests {
    use proptest::prelude::*;

    use super::*;

    /// Two elements packed into a single word are yielded back in order.
    #[test]
    fn test_elements() {
        let a = Element::new(0x4593, 0x0020, 0x0016, 0x0000, false);
        let b = Element::new(0x0000, 0x0119, 0x0016, 0x0000, false);
        let e = Elements {
            buf: &[],
            cur: pack(&[a, b]),
        };
        assert_eq!(e.collect::<Vec<_>>(), vec![a, b]);
    }

    /// The empty slice packs to zero and a lone element round-trips.
    #[test]
    fn test_pack_single_and_empty() {
        let a = Element::new(0x4593, 0x0020, 0x0016, 0x0000, false);
        assert_eq!(pack(&[]), 0);
        let e = Elements {
            buf: &[],
            cur: pack(&[a]),
        };
        assert_eq!(e.collect::<Vec<_>>(), vec![a]);
    }

    proptest! {
        // `l2` is a 9-bit field (`L2_BITS`), so draw from 0..512 rather than
        // 0..256; real data such as 0x0119 lies above 255. `l3` is 5 bits.
        #[test]
        fn proptest_element(l1 in any::<u16>(), l2 in (0u16..512), l3 in (0u8..32), variable in any::<bool>()) {
            let r = Element::new(l1, l2, l3, 0, variable);
            prop_assert_eq!(r.l1(), l1);
            prop_assert_eq!(r.l2(), l2);
            prop_assert_eq!(r.l3(), l3);
            prop_assert_eq!(r.l4(), 0);
            prop_assert_eq!(r.is_variable(), variable);
        }

        #[test]
        fn proptest_range(start in any::<u32>(), len in any::<u8>()) {
            let r = Range::new(start, len);
            prop_assert_eq!(r.start(), start as usize);
            prop_assert_eq!(r.end(), start as usize + len as usize);
        }
    }
}

View File

@@ -1,4 +1,5 @@
pub mod collator;
mod element;
mod table;
mod weights;

View File

@@ -1,11 +1,12 @@
use std::fmt::Display;
use u_fst::raw::{Fst, Output};
use crate::element::{Elements, Range};
use crate::weights::EXPLICIT_WEIGHTS;
const TABLE: Fst<&'static [u8]> =
Fst::new_unchecked(include_bytes!(concat!(env!("OUT_DIR"), "/table.fst")));
pub fn lookup(value: &[char]) -> Option<(Vec<Element>, usize)> {
pub(crate) fn lookup(value: &[char]) -> Option<(Elements, usize)> {
let mut node = TABLE.root();
let mut out = Output::zero();
@@ -30,132 +31,19 @@ pub fn lookup(value: &[char]) -> Option<(Vec<Element>, usize)> {
last_match.map(|(data, idx)| {
(
match Value::from_u64(data) {
Value::Entry(element) => vec![element],
Value::Index(idx, len) => {
let start = idx as usize;
let end = start + len as usize;
crate::weights::EXPLICIT_WEIGHTS[start..end]
.iter()
.map(|(l1, l2, l3, l4, variable)| Element {
l1: *l1,
l2: *l2,
l3: *l3,
l4: *l4,
variable: *variable,
})
.collect()
if data & 0x1 == 1 {
Elements {
buf: &[],
cur: data,
}
} else {
let r = Range(data >> 1);
let (first, buf) = EXPLICIT_WEIGHTS[r.start()..r.end()].split_at(1);
Elements { buf, cur: first[0] }
},
idx,
)
})
}
/// A collation element with its four weight levels and the variable flag
/// stored as plain fields.
#[derive(Clone, Copy, PartialEq, Debug)]
pub struct Element {
    pub l1: u16,
    pub l2: u16,
    pub l3: u8,
    pub l4: u16,
    pub variable: bool,
}

/// Formats an element as a marker followed by the four levels in four-digit
/// uppercase hex, separated by dots. The marker is `*` when `variable` is set
/// and `.` otherwise.
impl Display for Element {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self {
            l1,
            l2,
            l3,
            l4,
            variable,
        } = *self;
        let marker = if variable { "*" } else { "." };
        write!(f, "{}{:04X}.{:04X}.{:04X}.{:04X}", marker, l1, l2, l3, l4)
    }
}
/// A table value: either one inline element or an `(index, length)` pair.
///
/// Serialized form (see `to_u64`): bit 0 is the tag, `0` for `Entry` and `1`
/// for `Index`.
#[derive(Clone, Copy, PartialEq, Debug)]
enum Value {
    Entry(Element),
    Index(u32, u8),
}

impl Value {
    /// Serializes the value into a `u64`.
    ///
    /// * `Entry`: variable flag at bit 1, `l1` at bit 2, `l2` at bit 18,
    ///   `l3` at bit 34, `l4` at bit 42; bit 0 stays clear.
    /// * `Index`: bit 0 set, length at bit 1, index at bit 9.
    fn to_u64(self) -> u64 {
        match self {
            Self::Entry(Element {
                l1,
                l2,
                l3,
                l4,
                variable,
            }) => {
                u64::from(variable) << 1
                    | u64::from(l1) << 2
                    | u64::from(l2) << 18
                    | u64::from(l3) << 34
                    | u64::from(l4) << 42
            }
            Self::Index(idx, len) => 1 | u64::from(len) << 1 | u64::from(idx) << 9,
        }
    }

    /// Inverse of `to_u64`: the tag in bit 0 selects the variant and the
    /// fields are read back from the positions documented there.
    fn from_u64(data: u64) -> Self {
        let is_index = data & 1 != 0;
        if is_index {
            // The narrowing casts keep exactly the low 32 / 8 bits.
            return Self::Index((data >> 9) as u32, (data >> 1) as u8);
        }

        Self::Entry(Element {
            l1: (data >> 2) as u16,
            l2: (data >> 18) as u16,
            l3: (data >> 34) as u8,
            l4: (data >> 42) as u16,
            variable: data & 0b10 != 0,
        })
    }
}
#[cfg(test)]
mod tests {
    use proptest::prelude::*;

    use super::*;

    /// Arbitrary `Value::Index` over the full `u32` / `u8` ranges.
    fn index_strategy() -> impl Strategy<Value = Value> {
        (any::<u32>(), any::<u8>()).prop_map(|(idx, len)| Value::Index(idx, len))
    }

    /// Arbitrary `Value::Entry` with every field drawn from its full range.
    fn entry_strategy() -> impl Strategy<Value = Value> {
        (
            any::<u16>(),
            any::<u16>(),
            any::<u8>(),
            any::<u16>(),
            any::<bool>(),
        )
            .prop_map(|(l1, l2, l3, l4, variable)| {
                Value::Entry(Element {
                    l1,
                    l2,
                    l3,
                    l4,
                    variable,
                })
            })
    }

    /// Either variant of `Value`.
    fn value_strategy() -> impl Strategy<Value = Value> {
        prop_oneof![index_strategy(), entry_strategy()]
    }

    proptest! {
        // Serializing and then deserializing must give back the same value.
        #[test]
        fn proptest_serialize_and_deserialize(a in value_strategy()) {
            let encoded = a.to_u64();
            let decoded = Value::from_u64(encoded);
            prop_assert_eq!(a, decoded);
        }
    }
}