use std::collections::HashMap; use std::env; use std::fs; use std::path::Path; use u_fst::raw::Fst; fn main() { let data = fs::read_to_string("data/UnicodeData.txt").unwrap(); let out_dir = env::var_os("OUT_DIR").unwrap(); let dest_path = Path::new(&out_dir).join("table.fst"); let (entries, mut classes) = parse(&data); classes.sort_unstable(); assert!( (usize::BITS - classes.len().leading_zeros()) <= 6, "classes: {:#?}", classes ); let mut entries = entries .iter() .map(|(code_value, (class, mappings))| { let mut data = classes.iter().position(|x| x == class).unwrap() as u64; assert!(mappings.len() <= 2); for (i, mapping) in mappings.iter().enumerate() { let (class, has_mappings) = entries .get(mapping) .map(|(class, mappings)| { ( classes.iter().position(|x| x == class).unwrap() as u64, !mappings.is_empty(), ) }) .unwrap_or((0, false)); let entry = (*mapping as u64) << 7 | (class as u64) << 1 | has_mappings as u64; data |= entry << ((28 * i) + 6); } (code_value.to_ne_bytes(), data) }) .collect::>(); entries.sort_unstable_by_key(|(k, _)| *k); let data = Fst::from_iter_map(entries).unwrap().into_inner(); fs::write(&dest_path, data).unwrap(); let mut class_map = String::new(); class_map.push_str(&format!("const CLASS_MAP: [u8; {}] = [\n", classes.len(),)); for class in classes { class_map.push_str(&format!(" {},\n", class)); } class_map.push_str("];\n"); fs::write(&Path::new(&out_dir).join("class_map.rs"), class_map).unwrap(); println!("cargo:rerun-if-changed=data/UnicodeData.txt"); println!("cargo:rerun-if-changed=build.rs"); } /// http://www.unicode.org/L2/L1999/UnicodeData.html fn parse(data: &str) -> (HashMap)>, Vec) { let mut entries = HashMap::new(); let mut classes = Vec::new(); for line in data.lines() { let mut iter = line.split(';'); let code_point = iter .next() .map(|s| u32::from_str_radix(s, 16).expect("valid u32")) .expect("code value"); let combining_class = iter .nth(2) .map(|s| s.parse::().expect("valid u8")) .expect("canonical combining classes"); if !classes.contains(&combining_class) { classes.push(combining_class); } let decomposition_mapping = iter.nth(1).unwrap(); let mappings = if !decomposition_mapping.starts_with('<') { decomposition_mapping .split(' ') .filter(|s| !s.is_empty()) .map(|s| u32::from_str_radix(s, 16).expect("valid u32")) .collect::>() } else { Vec::new() }; entries.insert(code_point, (combining_class, mappings)); } (entries, classes) }