From c9764d49ac80fdcf69eadf2ce4eaf0a52bd3899a Mon Sep 17 00:00:00 2001 From: Anders Olsson Date: Tue, 24 May 2022 22:34:57 +0200 Subject: [PATCH] Improve performance even more --- crates/u-norm/README.md | 2 + crates/u-norm/build.rs | 81 ++++++++++++++++++------ crates/u-norm/src/lib.rs | 27 ++++---- crates/u-norm/src/table.rs | 122 ++++++++++++++++++++++++++----------- 4 files changed, 165 insertions(+), 67 deletions(-) diff --git a/crates/u-norm/README.md b/crates/u-norm/README.md index fd0b117..b732f6f 100644 --- a/crates/u-norm/README.md +++ b/crates/u-norm/README.md @@ -1,5 +1,7 @@ # UNF +[UnicodeData.txt](http://www.unicode.org/L2/L1999/UnicodeData.html) + ## Todo - [ ] Change Decomposition to be `struct Decomposition(u64)` that implements `Iterator` diff --git a/crates/u-norm/build.rs b/crates/u-norm/build.rs index 3832533..efaf40d 100644 --- a/crates/u-norm/build.rs +++ b/crates/u-norm/build.rs @@ -1,3 +1,4 @@ +use std::collections::HashMap; use std::env; use std::fs; use std::path::Path; @@ -10,9 +11,41 @@ fn main() { let out_dir = env::var_os("OUT_DIR").unwrap(); let dest_path = Path::new(&out_dir).join("table.fst"); - let mut entries = parse(&data) - .into_iter() - .map(|(code_value, entry)| (code_value.to_ne_bytes(), entry)) + let (entries, mut classes) = parse(&data); + + classes.sort_unstable(); + + assert!( + (usize::BITS - classes.len().leading_zeros()) <= 6, + "classes: {:#?}", + classes + ); + + let mut entries = entries + .iter() + .map(|(code_value, (class, mappings))| { + let mut data = classes.iter().position(|x| x == class).unwrap() as u64; + + assert!(mappings.len() <= 2); + + for (i, mapping) in mappings.iter().enumerate() { + let (class, has_mappings) = entries + .get(mapping) + .map(|(class, mappings)| { + ( + classes.iter().position(|x| x == class).unwrap() as u64, + !mappings.is_empty(), + ) + }) + .unwrap_or((0, false)); + + let entry = (*mapping as u64) << 7 | (class as u64) << 1 | has_mappings as u64; + + data |= entry << ((28 * i) + 6); + } + + (code_value.to_ne_bytes(), data) + }) .collect::>(); entries.sort_unstable_by_key(|(k, _)| *k); @@ -21,12 +54,26 @@ fn main() { fs::write(&dest_path, data).unwrap(); + let mut class_map = String::new(); + + class_map.push_str(&format!("const CLASS_MAP: [u8; {}] = [\n", classes.len(),)); + + for class in classes { + class_map.push_str(&format!(" {},\n", class)); + } + + class_map.push_str("];\n"); + + fs::write(&Path::new(&out_dir).join("class_map.rs"), class_map).unwrap(); + println!("cargo:rerun-if-changed=data/UnicodeData.txt"); println!("cargo:rerun-if-changed=build.rs"); } -fn parse(data: &str) -> Vec<(u32, u64)> { - let mut entries = Vec::new(); +/// http://www.unicode.org/L2/L1999/UnicodeData.html +fn parse(data: &str) -> (HashMap)>, Vec) { + let mut entries = HashMap::new(); + let mut classes = Vec::new(); for line in data.lines() { let mut iter = line.split(';'); @@ -41,26 +88,24 @@ fn parse(data: &str) -> Vec<(u32, u64)> { .map(|s| s.parse::().expect("valid u8")) .expect("canonical combining classes"); - let mut entry = combining_class as u64; + if !classes.contains(&combining_class) { + classes.push(combining_class); + } let decomposition_mapping = iter.nth(1).unwrap(); - if !decomposition_mapping.starts_with('<') { - let mappings = decomposition_mapping + let mappings = if !decomposition_mapping.starts_with('<') { + decomposition_mapping .split(' ') .filter(|s| !s.is_empty()) .map(|s| u32::from_str_radix(s, 16).expect("valid u32")) - .collect::>(); + .collect::>() + } else { + Vec::new() + }; - assert!(mappings.len() <= 2); - - for (i, mapping) in mappings.into_iter().enumerate() { - entry |= (mapping as u64) << ((21 * i) + 8); - } - } - - entries.push((code_point, entry)); + entries.insert(code_point, (combining_class, mappings)); } - entries + (entries, classes) } diff --git a/crates/u-norm/src/lib.rs b/crates/u-norm/src/lib.rs index cbc8c64..088fffb 100644 --- a/crates/u-norm/src/lib.rs +++ b/crates/u-norm/src/lib.rs @@ -27,17 +27,7 @@ impl Buffer { } #[inline(always)] - fn push_zero(&mut self, ch: char) { - self.sort_pending(); - - self.buffer.push((0, ch)); - self.ready.end = self.buffer.len(); - } - - #[inline(always)] - fn push_back(&mut self, ch: char) { - let class = table::lookup(ch).combining_class(); - + fn push(&mut self, ch: char, class: u8) { if class == 0 { self.sort_pending(); @@ -48,6 +38,11 @@ impl Buffer { } } + #[inline(always)] + fn push_back(&mut self, ch: char) { + self.push(ch, table::lookup(ch).combining_class()); + } + #[inline(always)] fn sort_pending(&mut self) { self.buffer[self.ready.end..].sort_by_key(|k| k.0); @@ -133,7 +128,7 @@ const S_COUNT: u32 = L_COUNT * N_COUNT; fn decompose(c: char, buffer: &mut Buffer) { // 7-bit ASCII never decomposes if c <= '\x7f' { - buffer.push_zero(c); + buffer.push(c, 0); return; } @@ -145,7 +140,13 @@ fn decompose(c: char, buffer: &mut Buffer) { if let Some(decomposed) = table::lookup(c).decomposition() { for d in decomposed { - decompose(d, buffer); + let c = d.char(); + + if d.has_decompositions() { + decompose(c, buffer); + } else { + buffer.push(c, d.combining_class()); + } } return; } diff --git a/crates/u-norm/src/table.rs b/crates/u-norm/src/table.rs index ea24d6f..51bdd8e 100644 --- a/crates/u-norm/src/table.rs +++ b/crates/u-norm/src/table.rs @@ -1,5 +1,7 @@ use u_fst::raw::Fst; +include!(concat!(env!("OUT_DIR"), "/class_map.rs")); + const TABLE: Fst<&'static [u8]> = Fst::new_unchecked(include_bytes!(concat!(env!("OUT_DIR"), "/table.fst"))); @@ -13,26 +15,6 @@ pub fn lookup(ch: char) -> Entry { ) } -#[derive(Clone, Copy, PartialEq, Debug)] -pub struct Decomposition(u64); - -impl Iterator for Decomposition { - type Item = char; - - #[inline(always)] - fn next(&mut self) -> Option { - let d = (self.0 & 0x1FFFFF) as u32; - - if d > 0 { - self.0 >>= 21; - - Some(unsafe { char::from_u32_unchecked(d) }) - } else { - None - } - } -} - #[derive(Clone, Copy, PartialEq, Debug)] pub struct Entry(u64); @@ -43,44 +25,101 @@ impl Entry { #[inline(always)] pub fn combining_class(&self) -> u8 { - (self.0 & 0xFF) as u8 + CLASS_MAP[(self.0 & 0x3F) as usize] } #[inline(always)] - pub fn decomposition(&self) -> Option { - let data = self.0 >> 8; + pub fn decomposition(&self) -> Option { + let data = self.0 >> 6; if data > 0 { - Some(Decomposition(data)) + Some(Decompositions(data)) } else { None } } } +#[derive(Clone, Copy, PartialEq, Debug)] +pub struct Decompositions(u64); + +impl Iterator for Decompositions { + type Item = Decomposition; + + #[inline(always)] + fn next(&mut self) -> Option { + let d = (self.0 & 0xFFFFFFF) as u32; + + if d > 0 { + self.0 >>= 28; + + Some(Decomposition(d)) + } else { + None + } + } +} + +#[derive(Clone, Copy, PartialEq, Debug)] +pub struct Decomposition(u32); + +impl Decomposition { + #[inline(always)] + pub fn has_decompositions(&self) -> bool { + (self.0 & 0x1) == 1 + } + + #[inline(always)] + pub fn combining_class(&self) -> u8 { + CLASS_MAP[((self.0 >> 1) & 0x3F) as usize] + } + + #[inline(always)] + pub fn char(&self) -> char { + unsafe { char::from_u32_unchecked(((self.0 >> 7) & 0x1FFFFF) as u32) } + } +} + #[cfg(test)] mod tests { use proptest::prelude::*; use super::*; - fn entry_strategy() -> impl Strategy { + fn entry_strategy() -> impl Strategy { ( - any::(), + (0u8..CLASS_MAP.len() as u8), (0u8..2), + any::(), + (0u8..CLASS_MAP.len() as u8), any::().prop_filter("", |c| *c != '\u{0}'), + any::(), + (0u8..CLASS_MAP.len() as u8), any::().prop_filter("", |c| *c != '\u{0}'), ) .prop_map( - |(combining_class, mapping_count, decomposition_first, decomposition_second)| { + |( + combining_class, + mapping_count, + d1_has_mapping, + d1_class, + d1_char, + d2_has_mapping, + d2_class, + d2_char, + )| { let mut entry = combining_class as u64; if mapping_count > 0 { - entry |= (decomposition_first as u64) << 8; + entry |= (d1_char as u64) << ((6 + 1) + 6) + | (d1_class as u64) << ((1) + 6) + | (d1_has_mapping as u64) << (6); } if mapping_count > 1 { - entry |= (decomposition_second as u64) << (21 + 8); + entry |= (d1_char as u64) << ((6 + 1) + (21 + 6)) + | (d1_class as u64) << ((1) + (21 + 6)) + | (d1_has_mapping as u64) << (21 + 6); } ( @@ -88,8 +127,12 @@ mod tests { ( combining_class, mapping_count, - decomposition_first, - decomposition_second, + d1_has_mapping, + d1_class, + d1_char, + d2_has_mapping, + d2_class, + d2_char, ), ) }, @@ -99,18 +142,25 @@ mod tests { proptest! { #[test] fn proptest_entry_serialize_and_deserialize(a in entry_strategy()) { - let (data, (combining_class, mapping_count, decomposition_first, decomposition_second)) = a; + let (data, (combining_class, + mapping_count, + d1_has_mapping, + d1_class, + d1_char, + d2_has_mapping, + d2_class, + d2_char)) = a; let b = Entry::new(data); - prop_assert_eq!(b.combining_class(), combining_class, "data = {:064b}", data); + prop_assert_eq!(b.combining_class(), CLASS_MAP[combining_class as usize], "data = {:064b}", data); - let c = b.decomposition().map(|i| i.collect::>()); + let c = b.decomposition().map(|i| i.map(|d| (d.has_decompositions(), d.combining_class(), d.char())).collect::>()); match mapping_count { 0 => prop_assert_eq!(c, None, "data = {:064b}", data), - 1 => prop_assert_eq!(c, Some(vec![decomposition_first]), "data = {:064b}", data), - 2 => prop_assert_eq!(c, Some(vec![decomposition_first, decomposition_second]), "data = {:064b}", data), + 1 => prop_assert_eq!(c, Some(vec![(d1_has_mapping, CLASS_MAP[d1_class as usize], d1_char)]), "data = {:064b}", data), + 2 => prop_assert_eq!(c, Some(vec![(d1_has_mapping, CLASS_MAP[d1_class as usize], d1_char), (d2_has_mapping, CLASS_MAP[d2_class as usize], d2_char)]), "data = {:064b}", data), _ => unreachable!(), } }