112 lines
3.1 KiB
Rust
112 lines
3.1 KiB
Rust
use std::collections::HashMap;
|
|
use std::env;
|
|
use std::fs;
|
|
use std::path::Path;
|
|
|
|
use u_fst::raw::Fst;
|
|
|
|
fn main() {
|
|
let data = fs::read_to_string("data/UnicodeData.txt").unwrap();
|
|
|
|
let out_dir = env::var_os("OUT_DIR").unwrap();
|
|
let dest_path = Path::new(&out_dir).join("table.fst");
|
|
|
|
let (entries, mut classes) = parse(&data);
|
|
|
|
classes.sort_unstable();
|
|
|
|
assert!(
|
|
(usize::BITS - classes.len().leading_zeros()) <= 6,
|
|
"classes: {:#?}",
|
|
classes
|
|
);
|
|
|
|
let mut entries = entries
|
|
.iter()
|
|
.map(|(code_value, (class, mappings))| {
|
|
let mut data = classes.iter().position(|x| x == class).unwrap() as u64;
|
|
|
|
assert!(mappings.len() <= 2);
|
|
|
|
for (i, mapping) in mappings.iter().enumerate() {
|
|
let (class, has_mappings) = entries
|
|
.get(mapping)
|
|
.map(|(class, mappings)| {
|
|
(
|
|
classes.iter().position(|x| x == class).unwrap() as u64,
|
|
!mappings.is_empty(),
|
|
)
|
|
})
|
|
.unwrap_or((0, false));
|
|
|
|
let entry = (*mapping as u64) << 7 | (class as u64) << 1 | has_mappings as u64;
|
|
|
|
data |= entry << ((28 * i) + 6);
|
|
}
|
|
|
|
(code_value.to_ne_bytes(), data)
|
|
})
|
|
.collect::<Vec<_>>();
|
|
|
|
entries.sort_unstable_by_key(|(k, _)| *k);
|
|
|
|
let data = Fst::from_iter_map(entries).unwrap().into_inner();
|
|
|
|
fs::write(&dest_path, data).unwrap();
|
|
|
|
let mut class_map = String::new();
|
|
|
|
class_map.push_str(&format!("const CLASS_MAP: [u8; {}] = [\n", classes.len(),));
|
|
|
|
for class in classes {
|
|
class_map.push_str(&format!(" {},\n", class));
|
|
}
|
|
|
|
class_map.push_str("];\n");
|
|
|
|
fs::write(&Path::new(&out_dir).join("class_map.rs"), class_map).unwrap();
|
|
|
|
println!("cargo:rerun-if-changed=data/UnicodeData.txt");
|
|
println!("cargo:rerun-if-changed=build.rs");
|
|
}
|
|
|
|
/// Parses the text of `UnicodeData.txt`.
///
/// Field layout reference: http://www.unicode.org/L2/L1999/UnicodeData.html
///
/// Every non-blank line is treated as a `;`-separated record, of which three
/// fields are read:
///
/// * field 0 — the code value, in hexadecimal;
/// * field 3 — the canonical combining class, in decimal;
/// * field 5 — the decomposition mapping: space-separated hexadecimal code
///   values. A mapping that starts with `<` (a tagged mapping) is ignored and
///   recorded as empty.
///
/// Blank and whitespace-only lines are skipped.
///
/// # Returns
///
/// A pair of:
///
/// * a map from code value to `(combining class, decomposition mapping)`;
/// * every distinct combining class, in order of first appearance.
///
/// # Panics
///
/// Panics if a record lacks one of the fields above, or if a code value or
/// combining class cannot be parsed as a number.
fn parse(data: &str) -> (HashMap<u32, (u8, Vec<u32>)>, Vec<u8>) {
    let mut entries: HashMap<u32, (u8, Vec<u32>)> = HashMap::new();
    let mut classes: Vec<u8> = Vec::new();

    // Skipping blank lines keeps a stray empty line from aborting the build
    // with a misleading "valid u32" panic.
    for line in data.lines().filter(|line| !line.trim().is_empty()) {
        let mut fields = line.split(';');

        // Field 0.
        let code_point = fields
            .next()
            .map(|s| u32::from_str_radix(s, 16).expect("valid u32"))
            .expect("code value");

        // Field 3: `nth(2)` skips fields 1 and 2.
        let combining_class = fields
            .nth(2)
            .map(|s| s.parse::<u8>().expect("valid u8"))
            .expect("canonical combining classes");

        // Only a handful of distinct classes occur, so a linear scan is fine
        // and preserves first-seen order.
        if !classes.contains(&combining_class) {
            classes.push(combining_class);
        }

        // Field 5: `nth(1)` skips field 4.
        let decomposition_mapping = fields.nth(1).expect("decomposition mapping");

        let mappings = if decomposition_mapping.starts_with('<') {
            Vec::new()
        } else {
            decomposition_mapping
                .split(' ')
                .filter(|s| !s.is_empty())
                .map(|s| u32::from_str_radix(s, 16).expect("valid u32"))
                .collect()
        };

        entries.insert(code_point, (combining_class, mappings));
    }

    (entries, classes)
}
|