Improve performance even more

This commit is contained in:
2022-05-24 22:34:57 +02:00
parent 17ea7bc60c
commit c9764d49ac
4 changed files with 165 additions and 67 deletions

View File

@@ -1,3 +1,4 @@
use std::collections::HashMap;
use std::env;
use std::fs;
use std::path::Path;
@@ -10,9 +11,41 @@ fn main() {
let out_dir = env::var_os("OUT_DIR").unwrap();
let dest_path = Path::new(&out_dir).join("table.fst");
let mut entries = parse(&data)
.into_iter()
.map(|(code_value, entry)| (code_value.to_ne_bytes(), entry))
let (entries, mut classes) = parse(&data);
classes.sort_unstable();
assert!(
(usize::BITS - classes.len().leading_zeros()) <= 6,
"classes: {:#?}",
classes
);
let mut entries = entries
.iter()
.map(|(code_value, (class, mappings))| {
let mut data = classes.iter().position(|x| x == class).unwrap() as u64;
assert!(mappings.len() <= 2);
for (i, mapping) in mappings.iter().enumerate() {
let (class, has_mappings) = entries
.get(mapping)
.map(|(class, mappings)| {
(
classes.iter().position(|x| x == class).unwrap() as u64,
!mappings.is_empty(),
)
})
.unwrap_or((0, false));
let entry = (*mapping as u64) << 7 | (class as u64) << 1 | has_mappings as u64;
data |= entry << ((28 * i) + 6);
}
(code_value.to_ne_bytes(), data)
})
.collect::<Vec<_>>();
entries.sort_unstable_by_key(|(k, _)| *k);
@@ -21,12 +54,26 @@ fn main() {
fs::write(&dest_path, data).unwrap();
let mut class_map = String::new();
class_map.push_str(&format!("const CLASS_MAP: [u8; {}] = [\n", classes.len(),));
for class in classes {
class_map.push_str(&format!(" {},\n", class));
}
class_map.push_str("];\n");
fs::write(&Path::new(&out_dir).join("class_map.rs"), class_map).unwrap();
println!("cargo:rerun-if-changed=data/UnicodeData.txt");
println!("cargo:rerun-if-changed=build.rs");
}
fn parse(data: &str) -> Vec<(u32, u64)> {
let mut entries = Vec::new();
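/// Parses semicolon-separated Unicode character data, one record per line
/// (presumably the contents of `data/UnicodeData.txt`; field layout described
/// at <http://www.unicode.org/L2/L1999/UnicodeData.html>).
///
/// Returns a map from code point to `(combining class, decomposition mapping)`
/// together with the distinct combining classes in first-seen order. The
/// mapping is left empty when the decomposition field starts with `<`.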
fn parse(data: &str) -> (HashMap<u32, (u8, Vec<u32>)>, Vec<u8>) {
let mut entries = HashMap::new();
let mut classes = Vec::new();
for line in data.lines() {
let mut iter = line.split(';');
@@ -41,26 +88,24 @@ fn parse(data: &str) -> Vec<(u32, u64)> {
.map(|s| s.parse::<u8>().expect("valid u8"))
.expect("canonical combining classes");
let mut entry = combining_class as u64;
if !classes.contains(&combining_class) {
classes.push(combining_class);
}
let decomposition_mapping = iter.nth(1).unwrap();
if !decomposition_mapping.starts_with('<') {
let mappings = decomposition_mapping
let mappings = if !decomposition_mapping.starts_with('<') {
decomposition_mapping
.split(' ')
.filter(|s| !s.is_empty())
.map(|s| u32::from_str_radix(s, 16).expect("valid u32"))
.collect::<Vec<_>>();
.collect::<Vec<_>>()
} else {
Vec::new()
};
assert!(mappings.len() <= 2);
for (i, mapping) in mappings.into_iter().enumerate() {
entry |= (mapping as u64) << ((21 * i) + 8);
}
}
entries.push((code_point, entry));
entries.insert(code_point, (combining_class, mappings));
}
entries
(entries, classes)
}