Better packing for u-sort

This commit is contained in:
2022-05-27 07:29:28 +02:00
parent cb74158d5d
commit e9935c8b54
7 changed files with 367 additions and 241 deletions

View File

@@ -0,0 +1,84 @@
use std::ops::Range;
use tinyvec::TinyVec;
use crate::table;
/// Reordering buffer of `(combining class, char)` pairs.
///
/// Entries in `buffer[ready.start..ready.end]` are in their final order and are
/// handed out by `pop`. Entries from `ready.end` onwards are still pending and
/// are stably sorted by combining class before they become ready.
pub(crate) struct Buffer {
    buffer: TinyVec<[(u8, char); 4]>,
    ready: Range<usize>,
}

impl Buffer {
    /// Creates an empty buffer with an empty ready range.
    pub(crate) fn new() -> Self {
        Self {
            ready: 0..0,
            buffer: TinyVec::new(),
        }
    }

    /// Returns `true` when the ready range has a non-zero end.
    #[inline(always)]
    pub(crate) fn is_ready(&self) -> bool {
        self.ready.end > 0
    }

    /// Returns `true` when the buffer holds no entries at all, ready or pending.
    #[inline(always)]
    pub(crate) fn is_empty(&self) -> bool {
        self.buffer.is_empty()
    }

    /// Sorts the pending tail and marks every buffered entry as ready.
    #[inline(always)]
    pub(crate) fn finish(&mut self) {
        self.sort_pending();
        let len = self.buffer.len();
        self.ready.end = len;
    }

    /// Appends `ch` together with its combining class.
    ///
    /// A class of 0 first sorts the pending tail and then marks everything up to
    /// and including the new entry as ready. Any other class leaves the entry
    /// pending.
    #[inline(always)]
    pub(crate) fn push(&mut self, ch: char, class: u8) {
        let is_starter = class == 0;
        if is_starter {
            self.sort_pending();
        }
        self.buffer.push((class, ch));
        if is_starter {
            self.ready.end = self.buffer.len();
        }
    }

    /// Appends `ch`, taking its combining class from the lookup table.
    #[inline(always)]
    pub(crate) fn push_back(&mut self, ch: char) {
        let class = table::lookup(ch).combining_class();
        self.push(ch, class);
    }

    /// Returns the char at the front of the ready range and advances past it.
    ///
    /// Once the ready range is used up, the remaining pending entries are moved
    /// to the front of the buffer. The buffer is indexed directly, so this
    /// panics if `ready.start` is out of bounds.
    #[inline(always)]
    pub(crate) fn pop(&mut self) -> char {
        let ch = self.buffer[self.ready.start].1;
        self.ready.start += 1;
        if self.ready.start == self.ready.end {
            self.reset();
        }
        ch
    }

    /// Stably sorts the pending entries by combining class.
    #[inline(always)]
    fn sort_pending(&mut self) {
        let pending = &mut self.buffer[self.ready.end..];
        pending.sort_by_key(|&(class, _)| class);
    }

    /// Discards everything before `ready.end`, shifts the pending entries to the
    /// front and clears the ready range.
    #[inline(always)]
    fn reset(&mut self) {
        let consumed = self.ready.end;
        let pending = self.buffer.len() - consumed;
        self.buffer.copy_within(consumed.., 0);
        self.buffer.truncate(pending);
        self.ready = 0..0;
    }
}

View File

@@ -1,11 +1,11 @@
use std::iter::Fuse; use std::iter::Fuse;
use std::ops::Range;
use std::str::Chars; use std::str::Chars;
use tinyvec::TinyVec; mod buffer;
pub mod table; pub mod table;
use buffer::Buffer;
pub fn nfd(s: &str) -> Decompositions<Chars<'_>> { pub fn nfd(s: &str) -> Decompositions<Chars<'_>> {
Decompositions { Decompositions {
iter: s.chars().fuse(), iter: s.chars().fuse(),
@@ -13,65 +13,6 @@ pub fn nfd(s: &str) -> Decompositions<Chars<'_>> {
} }
} }
struct Buffer {
buffer: TinyVec<[(u8, char); 4]>,
ready: Range<usize>,
}
impl Buffer {
fn new() -> Self {
Self {
buffer: TinyVec::new(),
ready: 0..0,
}
}
#[inline(always)]
fn push(&mut self, ch: char, class: u8) {
if class == 0 {
self.sort_pending();
self.buffer.push((class, ch));
self.ready.end = self.buffer.len();
} else {
self.buffer.push((class, ch));
}
}
#[inline(always)]
fn push_back(&mut self, ch: char) {
self.push(ch, table::lookup(ch).combining_class());
}
#[inline(always)]
fn sort_pending(&mut self) {
self.buffer[self.ready.end..].sort_by_key(|k| k.0);
}
#[inline(always)]
fn reset(&mut self) {
let pending = self.buffer.len() - self.ready.end;
for i in 0..pending {
self.buffer[i] = self.buffer[i + self.ready.end];
}
self.buffer.truncate(pending);
self.ready = 0..0;
}
#[inline(always)]
fn increment_next_ready(&mut self) {
let next = self.ready.start + 1;
if next == self.ready.end {
self.reset();
} else {
self.ready.start = next;
}
}
}
pub struct Decompositions<I> { pub struct Decompositions<I> {
iter: Fuse<I>, iter: Fuse<I>,
buffer: Buffer, buffer: Buffer,
@@ -82,17 +23,16 @@ impl<I: Iterator<Item = char>> Iterator for Decompositions<I> {
#[inline(always)] #[inline(always)]
fn next(&mut self) -> Option<Self::Item> { fn next(&mut self) -> Option<Self::Item> {
while self.buffer.ready.end == 0 { while !self.buffer.is_ready() {
match self.iter.next() { match self.iter.next() {
Some(ch) => { Some(ch) => {
decompose(ch, &mut self.buffer); decompose(ch, &mut self.buffer);
} }
None => { None => {
if self.buffer.buffer.is_empty() { if self.buffer.is_empty() {
return None; return None;
} else { } else {
self.buffer.sort_pending(); self.buffer.finish();
self.buffer.ready.end = self.buffer.buffer.len();
break; break;
} }
@@ -100,11 +40,7 @@ impl<I: Iterator<Item = char>> Iterator for Decompositions<I> {
} }
} }
let (_, ch) = self.buffer.buffer[self.buffer.ready.start]; Some(self.buffer.pop())
self.buffer.increment_next_ready();
Some(ch)
} }
fn size_hint(&self) -> (usize, Option<usize>) { fn size_hint(&self) -> (usize, Option<usize>) {

View File

@@ -5,6 +5,11 @@ use std::path::Path;
use parse::uca::allkeys; use parse::uca::allkeys;
use u_fst::raw::Builder; use u_fst::raw::Builder;
#[path = "src/element.rs"]
mod element;
use element::{Element, Range};
fn main() { fn main() {
println!("cargo:rerun-if-changed=data/allkeys.txt"); println!("cargo:rerun-if-changed=data/allkeys.txt");
println!("cargo:rerun-if-changed=build.rs"); println!("cargo:rerun-if-changed=build.rs");
@@ -55,26 +60,35 @@ fn main() {
let mut builder = Builder::memory(); let mut builder = Builder::memory();
let mut overflow = Vec::new(); let mut overflow = Vec::new();
for (chars, mut elements) in entries.into_iter() { for (chars, elements) in entries.into_iter() {
let value = match elements.len() { let value = if elements.len() <= 2 {
1 => { let chunks = elements
let element = elements.pop().unwrap(); .iter()
.map(|e| Element::new(e.l1, e.l2, e.l3, e.l4, e.variable))
.collect::<Vec<_>>();
((element.l4 as u64) << 42) element::pack(&chunks)
| ((element.l3 as u64) << 34) } else {
| ((element.l2 as u64) << 18) let elements = elements
| ((element.l1 as u64) << 2) .chunks(2)
| (if element.variable { 1 } else { 0 } << 1) .map(|chunks| {
} let chunks = chunks
2.. => { .iter()
let idx = overflow.len(); .map(|e| Element::new(e.l1, e.l2, e.l3, e.l4, e.variable))
let len = elements.len(); .collect::<Vec<_>>();
overflow.extend(elements.into_iter()); element::pack(&chunks)
})
.collect::<Vec<_>>();
((idx as u64) << 9) | ((len as u64) << 1) | 1 let start = overflow.len() as u32;
} let len = elements.len() as u8;
_ => panic!("this shouldn't happen!"),
overflow.extend(elements.into_iter());
let data = Range::new(start, len).as_u64();
data << 1
}; };
builder.insert(chars, value).unwrap(); builder.insert(chars, value).unwrap();
@@ -87,15 +101,12 @@ fn main() {
let mut explicit_weights = String::new(); let mut explicit_weights = String::new();
explicit_weights.push_str(&format!( explicit_weights.push_str(&format!(
"pub const EXPLICIT_WEIGHTS: [(u16, u16, u8, u16, bool); {}] = [\n", "pub const EXPLICIT_WEIGHTS: [u64; {}] = [\n",
overflow.len(), overflow.len(),
)); ));
for element in overflow { for elements in overflow {
explicit_weights.push_str(&format!( explicit_weights.push_str(&format!(" {},\n", elements));
" ({}, {}, {}, {}, {}),\n",
element.l1, element.l2, element.l3, element.l4, element.variable,
));
} }
explicit_weights.push_str("];\n"); explicit_weights.push_str("];\n");

View File

@@ -1,6 +1,7 @@
use std::iter; use std::iter;
use crate::table::{self, Element}; use crate::element::Element;
use crate::table;
use crate::weights::IMPLICIT_WEIGHTS; use crate::weights::IMPLICIT_WEIGHTS;
#[derive(Default)] #[derive(Default)]
@@ -8,7 +9,7 @@ pub struct Collator;
impl Collator { impl Collator {
fn collation_elements(&self, normalized: &[char]) -> Vec<Element> { fn collation_elements(&self, normalized: &[char]) -> Vec<Element> {
let debug = false; let debug = true;
let mut all_elements = Vec::new(); let mut all_elements = Vec::new();
@@ -28,7 +29,8 @@ impl Collator {
} }
while cp_index < code_points_len { while cp_index < code_points_len {
let (mut elements, idx) = table::lookup(&code_points[cp_index..]).unwrap_or_default(); let (elements, idx) = table::lookup(&code_points[cp_index..]).unwrap_or_default();
let mut elements = elements.collect::<Vec<_>>();
if debug { if debug {
eprintln!( eprintln!(
@@ -112,6 +114,7 @@ impl Collator {
} }
let (new_elements, new_idx) = table::lookup(&new_key).expect("lookup"); let (new_elements, new_idx) = table::lookup(&new_key).expect("lookup");
let new_elements = new_elements.collect::<Vec<_>>();
if debug { if debug {
eprintln!( eprintln!(
@@ -235,42 +238,36 @@ impl Collator {
} }
vec![ vec![
Element { Element::new(
l1: aaaa.map(|x| (x & 0xFFFF) as u16).unwrap_or(0), aaaa.map(|x| (x & 0xFFFF) as u16).unwrap_or(0),
l2: 0x0020, 0x0020,
l3: 0x0002, 0x0002,
l4: 0x0000, 0x0000,
variable: false, false,
}, ),
Element { Element::new((bbbb & 0xFFFF) as u16, 0x0000, 0x0000, 0x0000, false),
l1: (bbbb & 0xFFFF) as u16,
l2: 0x0000,
l3: 0x0000,
l4: 0x0000,
variable: false,
},
] ]
} }
fn sort_key_from_collation_elements(&self, collation_elements: &[Element]) -> Vec<u16> { fn sort_key_from_collation_elements(&self, collation_elements: &[Element]) -> Vec<u16> {
let l1 = collation_elements let l1 = collation_elements
.iter() .iter()
.map(|element| element.l1) .map(|element| element.l1())
.filter(|x| *x > 0); .filter(|x| *x > 0);
let l2 = collation_elements let l2 = collation_elements
.iter() .iter()
.map(|element| element.l2) .map(|element| element.l2())
.filter(|x| *x > 0); .filter(|x| *x > 0);
let l3 = collation_elements let l3 = collation_elements
.iter() .iter()
.map(|element| element.l3 as u16) .map(|element| element.l3() as u16)
.filter(|x| *x > 0); .filter(|x| *x > 0);
let l4 = collation_elements let l4 = collation_elements
.iter() .iter()
.map(|element| element.l4) .map(|element| element.l4())
.filter(|x| *x > 0); .filter(|x| *x > 0);
l1.chain(iter::once(0)) l1.chain(iter::once(0))
@@ -371,4 +368,24 @@ mod tests {
.join(" ") .join(" ")
); );
} }
#[test]
fn test_bug_004() {
    // 31B3 0062; # (ㆳ) BOPOMOFO LETTER INNN [4593 208F | 0020 0119 0020 | 0016 0016 0002 |]
    let input = "\u{31b3}\u{62}";
    let expected = "4593 208F | 0020 0119 0020 | 0016 0016 0002 |";

    let collator = Collator::default();
    let key = collator.sort_key(input);

    // On failure the message shows the NFD form of the input as hex code points.
    similar_asserts::assert_eq!(
        expected,
        fmt(&key),
        "nfd: {:?}",
        u_norm::nfd(input)
            .map(|ch| format!("{:04X}", ch as u32))
            .collect::<Vec<_>>()
            .join(" ")
    );
}
} }

View File

@@ -0,0 +1,189 @@
use std::fmt::Display;
/// Number of bits the level-1 weight occupies in a packed `Element`.
const L1_BITS: u32 = 16;
/// Number of bits the level-2 weight occupies in a packed `Element`.
const L2_BITS: u32 = 9;
/// Number of bits the level-3 weight occupies in a packed `Element`.
const L3_BITS: u32 = 5;
/// Number of bits used for the `variable` flag (the lowest bit of a packed `Element`).
const VARIABLE_BITS: u32 = 1;
/// Iterator that unpacks `Element`s from 64-bit words.
///
/// `cur` is the word currently being unpacked, 32 bits per element, lowest
/// half first. `buf` holds the words that are read once `cur` reaches zero.
#[derive(Clone, Copy, Default)]
pub(crate) struct Elements {
    pub(crate) buf: &'static [u64],
    pub(crate) cur: u64,
}

impl Elements {
    /// Returns `true` when neither the current word nor the remaining slice
    /// holds any data.
    pub(crate) fn is_empty(&self) -> bool {
        self.buf.is_empty() && self.cur == 0
    }
}

impl Iterator for Elements {
    type Item = Element;

    #[inline(always)]
    fn next(&mut self) -> Option<Self::Item> {
        if self.cur == 0 {
            // The current word is used up: refill from the slice, or stop when
            // the slice is empty as well.
            let (&word, rest) = self.buf.split_first()?;
            self.cur = word;
            self.buf = rest;
        }

        // Bit 0 of each 32-bit half is skipped; the 31 bits above it are the element.
        let bits = (self.cur >> 1) & 0x7FFF_FFFF;
        self.cur >>= 32;
        Some(Element(bits as u32))
    }
}
/// A collation element packed into the low 31 bits of a `u32`.
///
/// Layout, least significant bit first:
///
/// | field    | bits |
/// |----------|------|
/// | variable | 1    |
/// | l1       | 16   |
/// | l2       | 9    |
/// | l3       | 5    |
///
/// `l4` takes no storage: `new` only debug-asserts that it is zero and `l4()`
/// always returns 0.
#[derive(Clone, Copy, PartialEq, Debug)]
pub(crate) struct Element(u32);

impl Element {
    /// Packs the given weights and the `variable` flag into one element.
    ///
    /// In debug builds this asserts that each weight fits its field width and
    /// that `l4` is zero. `l4` is not stored.
    pub(crate) fn new(l1: u16, l2: u16, l3: u8, l4: u16, variable: bool) -> Self {
        debug_assert!(u16::BITS - l1.leading_zeros() <= L1_BITS);
        debug_assert!(u16::BITS - l2.leading_zeros() <= L2_BITS);
        debug_assert!(u8::BITS - l3.leading_zeros() <= L3_BITS);
        debug_assert!(l4 == 0);

        // Each field sits directly above the previous one.
        let l1_shift = VARIABLE_BITS;
        let l2_shift = l1_shift + L1_BITS;
        let l3_shift = l2_shift + L2_BITS;

        Self(
            u32::from(variable)
                | (u32::from(l1) << l1_shift)
                | (u32::from(l2) << l2_shift)
                | (u32::from(l3) << l3_shift),
        )
    }

    /// Level-1 weight (16 bits above the `variable` flag).
    #[inline(always)]
    pub(crate) fn l1(&self) -> u16 {
        let shifted = self.0 >> VARIABLE_BITS;
        (shifted & 0xFFFF) as u16
    }

    /// Level-2 weight (9 bits above `l1`).
    #[inline(always)]
    pub(crate) fn l2(&self) -> u16 {
        let shifted = self.0 >> (VARIABLE_BITS + L1_BITS);
        (shifted & 0x1FF) as u16
    }

    /// Level-3 weight (5 bits above `l2`).
    #[inline(always)]
    pub(crate) fn l3(&self) -> u8 {
        let shifted = self.0 >> (VARIABLE_BITS + L1_BITS + L2_BITS);
        (shifted & 0x1F) as u8
    }

    /// Level-4 weight; not stored, always 0.
    #[inline(always)]
    pub(crate) fn l4(&self) -> u16 {
        0
    }

    /// Whether the lowest bit (the `variable` flag) is set.
    #[inline(always)]
    fn is_variable(&self) -> bool {
        self.0 & 1 != 0
    }

    /// Returns the raw packed representation.
    pub(crate) fn as_u32(&self) -> u32 {
        self.0
    }
}

impl Display for Element {
    /// Writes `*` (variable) or `.` (not variable) followed by the four
    /// weights as dot-separated, zero-padded upper-case hex.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let marker = if self.is_variable() { "*" } else { "." };
        let (l1, l2, l3, l4) = (self.l1(), self.l2(), self.l3(), self.l4());
        write!(f, "{}{:04X}.{:04X}.{:04X}.{:04X}", marker, l1, l2, l3, l4)
    }
}
/// A `(start, len)` pair packed into a `u64`: `len` in the low 8 bits and
/// `start` in the 32 bits directly above.
#[derive(Clone, Copy, PartialEq, Debug)]
pub(crate) struct Range(pub(crate) u64);

impl Range {
    /// Packs `start` and `len` into a new range.
    pub(crate) fn new(start: u32, len: u8) -> Self {
        let packed = (u64::from(start) << 8) | u64::from(len);
        Self(packed)
    }

    /// First index covered by the range.
    #[inline(always)]
    pub(crate) fn start(&self) -> usize {
        let start = (self.0 >> 8) & 0xFFFF_FFFF;
        start as usize
    }

    /// One past the last index covered by the range (`start + len`).
    #[inline(always)]
    pub(crate) fn end(&self) -> usize {
        let len = (self.0 & 0xFF) as usize;
        self.start() + len
    }

    /// Returns the raw packed representation.
    pub(crate) fn as_u64(&self) -> u64 {
        self.0
    }
}
/// Packs up to two elements into one `u64`.
///
/// Each element takes a 32-bit half: the element's bits shifted left by one,
/// with bit 0 set as a marker. The first element ends up in the low half.
/// An empty slice packs to 0.
///
/// # Panics
///
/// Panics if more than two elements are passed.
pub(crate) fn pack(elements: &[Element]) -> u64 {
    assert!(elements.len() <= 2);

    // Walk back to front so the first element lands in the low 32 bits.
    // Shifting a zero accumulator is a no-op, so no special case is needed
    // for the first step.
    elements.iter().rev().fold(0u64, |packed, element| {
        let half = (u64::from(element.as_u32()) << 1) | 1;
        (packed << 32) | half
    })
}
#[cfg(test)]
mod tests {
    use proptest::prelude::*;

    use super::*;

    // Two elements packed into a single word come back out in their original order.
    #[test]
    fn test_elements() {
        let a = Element::new(0x4593, 0x0020, 0x0016, 0x0000, false);
        let b = Element::new(0x0000, 0x0119, 0x0016, 0x0000, false);

        let e = Elements {
            buf: &[],
            cur: pack(&[a, b]),
        };

        assert_eq!(e.collect::<Vec<_>>(), vec![a, b]);
    }

    proptest! {
        // Every field survives a round trip through `Element::new` and its accessors.
        // The l2/l3 ranges are derived from the bit-width constants so the whole
        // field (including the top bit of the 9-bit l2) is exercised.
        #[test]
        fn proptest_element(
            l1 in any::<u16>(),
            l2 in (0u16..(1u16 << L2_BITS)),
            l3 in (0u8..(1u8 << L3_BITS)),
            variable in any::<bool>()
        ) {
            let r = Element::new(l1, l2, l3, 0, variable);

            prop_assert_eq!(r.l1(), l1);
            prop_assert_eq!(r.l2(), l2);
            prop_assert_eq!(r.l3(), l3);
            prop_assert_eq!(r.l4(), 0);
            prop_assert_eq!(r.is_variable(), variable);
        }

        // `start` and `len` survive a round trip through `Range::new`.
        #[test]
        fn proptest_range(start in any::<u32>(), len in any::<u8>()) {
            let r = Range::new(start, len);

            prop_assert_eq!(r.start(), start as usize);
            prop_assert_eq!(r.end(), start as usize + len as usize);
        }
    }
}

View File

@@ -1,4 +1,5 @@
pub mod collator; pub mod collator;
mod element;
mod table; mod table;
mod weights; mod weights;

View File

@@ -1,11 +1,12 @@
use std::fmt::Display;
use u_fst::raw::{Fst, Output}; use u_fst::raw::{Fst, Output};
use crate::element::{Elements, Range};
use crate::weights::EXPLICIT_WEIGHTS;
const TABLE: Fst<&'static [u8]> = const TABLE: Fst<&'static [u8]> =
Fst::new_unchecked(include_bytes!(concat!(env!("OUT_DIR"), "/table.fst"))); Fst::new_unchecked(include_bytes!(concat!(env!("OUT_DIR"), "/table.fst")));
pub fn lookup(value: &[char]) -> Option<(Vec<Element>, usize)> { pub(crate) fn lookup(value: &[char]) -> Option<(Elements, usize)> {
let mut node = TABLE.root(); let mut node = TABLE.root();
let mut out = Output::zero(); let mut out = Output::zero();
@@ -30,132 +31,19 @@ pub fn lookup(value: &[char]) -> Option<(Vec<Element>, usize)> {
last_match.map(|(data, idx)| { last_match.map(|(data, idx)| {
( (
match Value::from_u64(data) { if data & 0x1 == 1 {
Value::Entry(element) => vec![element], Elements {
Value::Index(idx, len) => { buf: &[],
let start = idx as usize; cur: data,
let end = start + len as usize;
crate::weights::EXPLICIT_WEIGHTS[start..end]
.iter()
.map(|(l1, l2, l3, l4, variable)| Element {
l1: *l1,
l2: *l2,
l3: *l3,
l4: *l4,
variable: *variable,
})
.collect()
} }
} else {
let r = Range(data >> 1);
let (first, buf) = EXPLICIT_WEIGHTS[r.start()..r.end()].split_at(1);
Elements { buf, cur: first[0] }
}, },
idx, idx,
) )
}) })
} }
#[derive(Clone, Copy, PartialEq, Debug)]
pub struct Element {
pub l1: u16,
pub l2: u16,
pub l3: u8,
pub l4: u16,
pub variable: bool,
}
impl Display for Element {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"{}{:04X}.{:04X}.{:04X}.{:04X}",
if self.variable { "*" } else { "." },
self.l1,
self.l2,
self.l3,
self.l4
)?;
Ok(())
}
}
#[derive(Clone, Copy, PartialEq, Debug)]
enum Value {
Entry(Element),
Index(u32, u8),
}
impl Value {
fn to_u64(self) -> u64 {
match self {
Self::Entry(element) => {
((element.l4 as u64) << 42)
| ((element.l3 as u64) << 34)
| ((element.l2 as u64) << 18)
| ((element.l1 as u64) << 2)
| (if element.variable { 1 } else { 0 } << 1)
}
Self::Index(idx, len) => ((idx as u64) << 9) | ((len as u64) << 1) | 1,
}
}
fn from_u64(data: u64) -> Self {
if (data & 1) == 0 {
let variable = ((data >> 1) & 1) == 1;
let l1 = ((data >> 2) & 0xFFFF) as u16;
let l2 = ((data >> 18) & 0xFFFF) as u16;
let l3 = ((data >> 34) & 0xFF) as u8;
let l4 = ((data >> 42) & 0xFFFF) as u16;
Self::Entry(Element {
l1,
l2,
l3,
l4,
variable,
})
} else {
let len = ((data >> 1) & 0xFF) as u8;
let idx = ((data >> 9) & 0xFFFFFFFF) as u32;
Self::Index(idx, len)
}
}
}
#[cfg(test)]
mod tests {
use proptest::prelude::*;
use super::*;
fn value_strategy() -> impl Strategy<Value = Value> {
prop_oneof![
(any::<u32>(), any::<u8>()).prop_map(|(idx, len)| Value::Index(idx, len)),
(
any::<u16>(),
any::<u16>(),
any::<u8>(),
any::<u16>(),
any::<bool>()
)
.prop_map(|(l1, l2, l3, l4, variable)| Value::Entry(Element {
l1,
l2,
l3,
l4,
variable
})),
]
}
proptest! {
#[test]
fn proptest_serialize_and_deserialize(a in value_strategy()) {
let data = a.to_u64();
let b = Value::from_u64(data);
prop_assert_eq!(a, b);
}
}
}