Improve performance even more
This commit is contained in:
@@ -1,5 +1,7 @@
|
|||||||
# UNF
|
# UNF
|
||||||
|
|
||||||
|
[UnicodeData.txt](http://www.unicode.org/L2/L1999/UnicodeData.html)
|
||||||
|
|
||||||
## Todo
|
## Todo
|
||||||
|
|
||||||
- [ ] Change Decomposition to be `struct Decomposition(u64)` that implements `Iterator`
|
- [ ] Change Decomposition to be `struct Decomposition(u64)` that implements `Iterator`
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
use std::collections::HashMap;
|
||||||
use std::env;
|
use std::env;
|
||||||
use std::fs;
|
use std::fs;
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
@@ -10,9 +11,41 @@ fn main() {
|
|||||||
let out_dir = env::var_os("OUT_DIR").unwrap();
|
let out_dir = env::var_os("OUT_DIR").unwrap();
|
||||||
let dest_path = Path::new(&out_dir).join("table.fst");
|
let dest_path = Path::new(&out_dir).join("table.fst");
|
||||||
|
|
||||||
let mut entries = parse(&data)
|
let (entries, mut classes) = parse(&data);
|
||||||
.into_iter()
|
|
||||||
.map(|(code_value, entry)| (code_value.to_ne_bytes(), entry))
|
classes.sort_unstable();
|
||||||
|
|
||||||
|
assert!(
|
||||||
|
(usize::BITS - classes.len().leading_zeros()) <= 6,
|
||||||
|
"classes: {:#?}",
|
||||||
|
classes
|
||||||
|
);
|
||||||
|
|
||||||
|
let mut entries = entries
|
||||||
|
.iter()
|
||||||
|
.map(|(code_value, (class, mappings))| {
|
||||||
|
let mut data = classes.iter().position(|x| x == class).unwrap() as u64;
|
||||||
|
|
||||||
|
assert!(mappings.len() <= 2);
|
||||||
|
|
||||||
|
for (i, mapping) in mappings.iter().enumerate() {
|
||||||
|
let (class, has_mappings) = entries
|
||||||
|
.get(mapping)
|
||||||
|
.map(|(class, mappings)| {
|
||||||
|
(
|
||||||
|
classes.iter().position(|x| x == class).unwrap() as u64,
|
||||||
|
!mappings.is_empty(),
|
||||||
|
)
|
||||||
|
})
|
||||||
|
.unwrap_or((0, false));
|
||||||
|
|
||||||
|
let entry = (*mapping as u64) << 7 | (class as u64) << 1 | has_mappings as u64;
|
||||||
|
|
||||||
|
data |= entry << ((28 * i) + 6);
|
||||||
|
}
|
||||||
|
|
||||||
|
(code_value.to_ne_bytes(), data)
|
||||||
|
})
|
||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
entries.sort_unstable_by_key(|(k, _)| *k);
|
entries.sort_unstable_by_key(|(k, _)| *k);
|
||||||
@@ -21,12 +54,26 @@ fn main() {
|
|||||||
|
|
||||||
fs::write(&dest_path, data).unwrap();
|
fs::write(&dest_path, data).unwrap();
|
||||||
|
|
||||||
|
let mut class_map = String::new();
|
||||||
|
|
||||||
|
class_map.push_str(&format!("const CLASS_MAP: [u8; {}] = [\n", classes.len(),));
|
||||||
|
|
||||||
|
for class in classes {
|
||||||
|
class_map.push_str(&format!(" {},\n", class));
|
||||||
|
}
|
||||||
|
|
||||||
|
class_map.push_str("];\n");
|
||||||
|
|
||||||
|
fs::write(&Path::new(&out_dir).join("class_map.rs"), class_map).unwrap();
|
||||||
|
|
||||||
println!("cargo:rerun-if-changed=data/UnicodeData.txt");
|
println!("cargo:rerun-if-changed=data/UnicodeData.txt");
|
||||||
println!("cargo:rerun-if-changed=build.rs");
|
println!("cargo:rerun-if-changed=build.rs");
|
||||||
}
|
}
|
||||||
|
|
||||||
fn parse(data: &str) -> Vec<(u32, u64)> {
|
/// http://www.unicode.org/L2/L1999/UnicodeData.html
|
||||||
let mut entries = Vec::new();
|
fn parse(data: &str) -> (HashMap<u32, (u8, Vec<u32>)>, Vec<u8>) {
|
||||||
|
let mut entries = HashMap::new();
|
||||||
|
let mut classes = Vec::new();
|
||||||
|
|
||||||
for line in data.lines() {
|
for line in data.lines() {
|
||||||
let mut iter = line.split(';');
|
let mut iter = line.split(';');
|
||||||
@@ -41,26 +88,24 @@ fn parse(data: &str) -> Vec<(u32, u64)> {
|
|||||||
.map(|s| s.parse::<u8>().expect("valid u8"))
|
.map(|s| s.parse::<u8>().expect("valid u8"))
|
||||||
.expect("canonical combining classes");
|
.expect("canonical combining classes");
|
||||||
|
|
||||||
let mut entry = combining_class as u64;
|
if !classes.contains(&combining_class) {
|
||||||
|
classes.push(combining_class);
|
||||||
|
}
|
||||||
|
|
||||||
let decomposition_mapping = iter.nth(1).unwrap();
|
let decomposition_mapping = iter.nth(1).unwrap();
|
||||||
|
|
||||||
if !decomposition_mapping.starts_with('<') {
|
let mappings = if !decomposition_mapping.starts_with('<') {
|
||||||
let mappings = decomposition_mapping
|
decomposition_mapping
|
||||||
.split(' ')
|
.split(' ')
|
||||||
.filter(|s| !s.is_empty())
|
.filter(|s| !s.is_empty())
|
||||||
.map(|s| u32::from_str_radix(s, 16).expect("valid u32"))
|
.map(|s| u32::from_str_radix(s, 16).expect("valid u32"))
|
||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>()
|
||||||
|
} else {
|
||||||
|
Vec::new()
|
||||||
|
};
|
||||||
|
|
||||||
assert!(mappings.len() <= 2);
|
entries.insert(code_point, (combining_class, mappings));
|
||||||
|
|
||||||
for (i, mapping) in mappings.into_iter().enumerate() {
|
|
||||||
entry |= (mapping as u64) << ((21 * i) + 8);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
entries.push((code_point, entry));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
entries
|
(entries, classes)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -27,17 +27,7 @@ impl Buffer {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[inline(always)]
|
#[inline(always)]
|
||||||
fn push_zero(&mut self, ch: char) {
|
fn push(&mut self, ch: char, class: u8) {
|
||||||
self.sort_pending();
|
|
||||||
|
|
||||||
self.buffer.push((0, ch));
|
|
||||||
self.ready.end = self.buffer.len();
|
|
||||||
}
|
|
||||||
|
|
||||||
#[inline(always)]
|
|
||||||
fn push_back(&mut self, ch: char) {
|
|
||||||
let class = table::lookup(ch).combining_class();
|
|
||||||
|
|
||||||
if class == 0 {
|
if class == 0 {
|
||||||
self.sort_pending();
|
self.sort_pending();
|
||||||
|
|
||||||
@@ -48,6 +38,11 @@ impl Buffer {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[inline(always)]
|
||||||
|
fn push_back(&mut self, ch: char) {
|
||||||
|
self.push(ch, table::lookup(ch).combining_class());
|
||||||
|
}
|
||||||
|
|
||||||
#[inline(always)]
|
#[inline(always)]
|
||||||
fn sort_pending(&mut self) {
|
fn sort_pending(&mut self) {
|
||||||
self.buffer[self.ready.end..].sort_by_key(|k| k.0);
|
self.buffer[self.ready.end..].sort_by_key(|k| k.0);
|
||||||
@@ -133,7 +128,7 @@ const S_COUNT: u32 = L_COUNT * N_COUNT;
|
|||||||
fn decompose(c: char, buffer: &mut Buffer) {
|
fn decompose(c: char, buffer: &mut Buffer) {
|
||||||
// 7-bit ASCII never decomposes
|
// 7-bit ASCII never decomposes
|
||||||
if c <= '\x7f' {
|
if c <= '\x7f' {
|
||||||
buffer.push_zero(c);
|
buffer.push(c, 0);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -145,7 +140,13 @@ fn decompose(c: char, buffer: &mut Buffer) {
|
|||||||
|
|
||||||
if let Some(decomposed) = table::lookup(c).decomposition() {
|
if let Some(decomposed) = table::lookup(c).decomposition() {
|
||||||
for d in decomposed {
|
for d in decomposed {
|
||||||
decompose(d, buffer);
|
let c = d.char();
|
||||||
|
|
||||||
|
if d.has_decompositions() {
|
||||||
|
decompose(c, buffer);
|
||||||
|
} else {
|
||||||
|
buffer.push(c, d.combining_class());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,5 +1,7 @@
|
|||||||
use u_fst::raw::Fst;
|
use u_fst::raw::Fst;
|
||||||
|
|
||||||
|
include!(concat!(env!("OUT_DIR"), "/class_map.rs"));
|
||||||
|
|
||||||
const TABLE: Fst<&'static [u8]> =
|
const TABLE: Fst<&'static [u8]> =
|
||||||
Fst::new_unchecked(include_bytes!(concat!(env!("OUT_DIR"), "/table.fst")));
|
Fst::new_unchecked(include_bytes!(concat!(env!("OUT_DIR"), "/table.fst")));
|
||||||
|
|
||||||
@@ -13,26 +15,6 @@ pub fn lookup(ch: char) -> Entry {
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, Copy, PartialEq, Debug)]
|
|
||||||
pub struct Decomposition(u64);
|
|
||||||
|
|
||||||
impl Iterator for Decomposition {
|
|
||||||
type Item = char;
|
|
||||||
|
|
||||||
#[inline(always)]
|
|
||||||
fn next(&mut self) -> Option<Self::Item> {
|
|
||||||
let d = (self.0 & 0x1FFFFF) as u32;
|
|
||||||
|
|
||||||
if d > 0 {
|
|
||||||
self.0 >>= 21;
|
|
||||||
|
|
||||||
Some(unsafe { char::from_u32_unchecked(d) })
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Clone, Copy, PartialEq, Debug)]
|
#[derive(Clone, Copy, PartialEq, Debug)]
|
||||||
pub struct Entry(u64);
|
pub struct Entry(u64);
|
||||||
|
|
||||||
@@ -43,44 +25,101 @@ impl Entry {
|
|||||||
|
|
||||||
#[inline(always)]
|
#[inline(always)]
|
||||||
pub fn combining_class(&self) -> u8 {
|
pub fn combining_class(&self) -> u8 {
|
||||||
(self.0 & 0xFF) as u8
|
CLASS_MAP[(self.0 & 0x3F) as usize]
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline(always)]
|
#[inline(always)]
|
||||||
pub fn decomposition(&self) -> Option<Decomposition> {
|
pub fn decomposition(&self) -> Option<Decompositions> {
|
||||||
let data = self.0 >> 8;
|
let data = self.0 >> 6;
|
||||||
|
|
||||||
if data > 0 {
|
if data > 0 {
|
||||||
Some(Decomposition(data))
|
Some(Decompositions(data))
|
||||||
} else {
|
} else {
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Clone, Copy, PartialEq, Debug)]
|
||||||
|
pub struct Decompositions(u64);
|
||||||
|
|
||||||
|
impl Iterator for Decompositions {
|
||||||
|
type Item = Decomposition;
|
||||||
|
|
||||||
|
#[inline(always)]
|
||||||
|
fn next(&mut self) -> Option<Self::Item> {
|
||||||
|
let d = (self.0 & 0xFFFFFFF) as u32;
|
||||||
|
|
||||||
|
if d > 0 {
|
||||||
|
self.0 >>= 28;
|
||||||
|
|
||||||
|
Some(Decomposition(d))
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Clone, Copy, PartialEq, Debug)]
|
||||||
|
pub struct Decomposition(u32);
|
||||||
|
|
||||||
|
impl Decomposition {
|
||||||
|
#[inline(always)]
|
||||||
|
pub fn has_decompositions(&self) -> bool {
|
||||||
|
(self.0 & 0x1) == 1
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline(always)]
|
||||||
|
pub fn combining_class(&self) -> u8 {
|
||||||
|
CLASS_MAP[((self.0 >> 1) & 0x3F) as usize]
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline(always)]
|
||||||
|
pub fn char(&self) -> char {
|
||||||
|
unsafe { char::from_u32_unchecked(((self.0 >> 7) & 0x1FFFFF) as u32) }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use proptest::prelude::*;
|
use proptest::prelude::*;
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
|
|
||||||
fn entry_strategy() -> impl Strategy<Value = (u64, (u8, u8, char, char))> {
|
fn entry_strategy() -> impl Strategy<Value = (u64, (u8, u8, bool, u8, char, bool, u8, char))> {
|
||||||
(
|
(
|
||||||
any::<u8>(),
|
(0u8..CLASS_MAP.len() as u8),
|
||||||
(0u8..2),
|
(0u8..2),
|
||||||
|
any::<bool>(),
|
||||||
|
(0u8..CLASS_MAP.len() as u8),
|
||||||
any::<char>().prop_filter("", |c| *c != '\u{0}'),
|
any::<char>().prop_filter("", |c| *c != '\u{0}'),
|
||||||
|
any::<bool>(),
|
||||||
|
(0u8..CLASS_MAP.len() as u8),
|
||||||
any::<char>().prop_filter("", |c| *c != '\u{0}'),
|
any::<char>().prop_filter("", |c| *c != '\u{0}'),
|
||||||
)
|
)
|
||||||
.prop_map(
|
.prop_map(
|
||||||
|(combining_class, mapping_count, decomposition_first, decomposition_second)| {
|
|(
|
||||||
|
combining_class,
|
||||||
|
mapping_count,
|
||||||
|
d1_has_mapping,
|
||||||
|
d1_class,
|
||||||
|
d1_char,
|
||||||
|
d2_has_mapping,
|
||||||
|
d2_class,
|
||||||
|
d2_char,
|
||||||
|
)| {
|
||||||
let mut entry = combining_class as u64;
|
let mut entry = combining_class as u64;
|
||||||
|
|
||||||
if mapping_count > 0 {
|
if mapping_count > 0 {
|
||||||
entry |= (decomposition_first as u64) << 8;
|
entry |= (d1_char as u64) << ((6 + 1) + 6)
|
||||||
|
| (d1_class as u64) << ((1) + 6)
|
||||||
|
| (d1_has_mapping as u64) << (6);
|
||||||
}
|
}
|
||||||
|
|
||||||
if mapping_count > 1 {
|
if mapping_count > 1 {
|
||||||
entry |= (decomposition_second as u64) << (21 + 8);
|
entry |= (d1_char as u64) << ((6 + 1) + (21 + 6))
|
||||||
|
| (d1_class as u64) << ((1) + (21 + 6))
|
||||||
|
| (d1_has_mapping as u64) << (21 + 6);
|
||||||
}
|
}
|
||||||
|
|
||||||
(
|
(
|
||||||
@@ -88,8 +127,12 @@ mod tests {
|
|||||||
(
|
(
|
||||||
combining_class,
|
combining_class,
|
||||||
mapping_count,
|
mapping_count,
|
||||||
decomposition_first,
|
d1_has_mapping,
|
||||||
decomposition_second,
|
d1_class,
|
||||||
|
d1_char,
|
||||||
|
d2_has_mapping,
|
||||||
|
d2_class,
|
||||||
|
d2_char,
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
},
|
},
|
||||||
@@ -99,18 +142,25 @@ mod tests {
|
|||||||
proptest! {
|
proptest! {
|
||||||
#[test]
|
#[test]
|
||||||
fn proptest_entry_serialize_and_deserialize(a in entry_strategy()) {
|
fn proptest_entry_serialize_and_deserialize(a in entry_strategy()) {
|
||||||
let (data, (combining_class, mapping_count, decomposition_first, decomposition_second)) = a;
|
let (data, (combining_class,
|
||||||
|
mapping_count,
|
||||||
|
d1_has_mapping,
|
||||||
|
d1_class,
|
||||||
|
d1_char,
|
||||||
|
d2_has_mapping,
|
||||||
|
d2_class,
|
||||||
|
d2_char)) = a;
|
||||||
|
|
||||||
let b = Entry::new(data);
|
let b = Entry::new(data);
|
||||||
|
|
||||||
prop_assert_eq!(b.combining_class(), combining_class, "data = {:064b}", data);
|
prop_assert_eq!(b.combining_class(), CLASS_MAP[combining_class as usize], "data = {:064b}", data);
|
||||||
|
|
||||||
let c = b.decomposition().map(|i| i.collect::<Vec<_>>());
|
let c = b.decomposition().map(|i| i.map(|d| (d.has_decompositions(), d.combining_class(), d.char())).collect::<Vec<_>>());
|
||||||
|
|
||||||
match mapping_count {
|
match mapping_count {
|
||||||
0 => prop_assert_eq!(c, None, "data = {:064b}", data),
|
0 => prop_assert_eq!(c, None, "data = {:064b}", data),
|
||||||
1 => prop_assert_eq!(c, Some(vec![decomposition_first]), "data = {:064b}", data),
|
1 => prop_assert_eq!(c, Some(vec![(d1_has_mapping, CLASS_MAP[d1_class as usize], d1_char)]), "data = {:064b}", data),
|
||||||
2 => prop_assert_eq!(c, Some(vec![decomposition_first, decomposition_second]), "data = {:064b}", data),
|
2 => prop_assert_eq!(c, Some(vec![(d1_has_mapping, CLASS_MAP[d1_class as usize], d1_char), (d2_has_mapping, CLASS_MAP[d2_class as usize], d2_char)]), "data = {:064b}", data),
|
||||||
_ => unreachable!(),
|
_ => unreachable!(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user