Rename crates
This commit is contained in:
25
crates/u-norm/Cargo.toml
Normal file
25
crates/u-norm/Cargo.toml
Normal file
@@ -0,0 +1,25 @@
|
||||
[package]
name = "u-norm"
version = "0.1.0"
edition = "2021"

# Disable the default libtest bench harness on the library target so that
# Criterion (configured below) is the only benchmark runner.
[lib]
bench = false

# Criterion benchmark entry point: benches/bench.rs supplies its own main
# via criterion_main!, hence harness = false.
[[bench]]
name = "bench"
harness = false

[dependencies]
fst = "0.4.7"
# "alloc" feature lets TinyVec spill to the heap when the inline capacity
# is exceeded (used by the decomposition buffer in src/lib.rs).
tinyvec = { version = "1.6.0", features = ["alloc"] }
u-fst = { path = "../u-fst" }

# build.rs serializes UnicodeData.txt into an FST at compile time.
[build-dependencies]
u-fst = { path = "../u-fst" }

# unicode-normalization serves as the reference implementation in benches;
# similar-asserts gives readable diffs for the conformance test strings.
[dev-dependencies]
criterion = "0.3.5"
proptest = "1.0.0"
similar-asserts = "1.2.0"
unicode-normalization = "0.1.19"
|
||||
5
crates/u-norm/README.md
Normal file
5
crates/u-norm/README.md
Normal file
@@ -0,0 +1,5 @@
|
||||
# u-norm
|
||||
|
||||
## Todo
|
||||
|
||||
- [x] Change Decomposition to be `struct Decomposition(u64)` that implements `Iterator`
|
||||
29
crates/u-norm/benches/bench.rs
Normal file
29
crates/u-norm/benches/bench.rs
Normal file
@@ -0,0 +1,29 @@
|
||||
use std::fs;
|
||||
|
||||
use criterion::{criterion_group, criterion_main, Criterion};
|
||||
|
||||
use u_norm::nfd;
|
||||
use unicode_normalization::UnicodeNormalization;
|
||||
|
||||
const ASCII: &str = "all types of normalized";
|
||||
|
||||
fn criterion_benchmark(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("ASCII");
|
||||
|
||||
group.bench_function("unf", |b| b.iter(|| nfd(ASCII).count()));
|
||||
group.bench_function("unicode-normalization", |b| b.iter(|| ASCII.nfd().count()));
|
||||
|
||||
group.finish();
|
||||
|
||||
let long = fs::read_to_string("benches/long.txt").unwrap();
|
||||
|
||||
let mut group = c.benchmark_group("Long");
|
||||
|
||||
group.bench_function("unf", |b| b.iter(|| nfd(&long).count()));
|
||||
group.bench_function("unicode-normalization", |b| b.iter(|| long.nfd().count()));
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
criterion_group!(benches, criterion_benchmark);
|
||||
criterion_main!(benches);
|
||||
2366
crates/u-norm/benches/long.txt
Normal file
2366
crates/u-norm/benches/long.txt
Normal file
File diff suppressed because one or more lines are too long
66
crates/u-norm/build.rs
Normal file
66
crates/u-norm/build.rs
Normal file
@@ -0,0 +1,66 @@
|
||||
use std::env;
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
|
||||
use u_fst::raw::Fst;
|
||||
|
||||
fn main() {
|
||||
let data = fs::read_to_string("data/UnicodeData.txt").unwrap();
|
||||
|
||||
let out_dir = env::var_os("OUT_DIR").unwrap();
|
||||
let dest_path = Path::new(&out_dir).join("table.fst");
|
||||
|
||||
let mut entries = parse(&data)
|
||||
.into_iter()
|
||||
.map(|(code_value, entry)| (code_value.to_ne_bytes(), entry))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
entries.sort_unstable_by_key(|(k, _)| *k);
|
||||
|
||||
let data = Fst::from_iter_map(entries).unwrap().into_inner();
|
||||
|
||||
fs::write(&dest_path, data).unwrap();
|
||||
|
||||
println!("cargo:rerun-if-changed=data/UnicodeData.txt");
|
||||
println!("cargo:rerun-if-changed=build.rs");
|
||||
}
|
||||
|
||||
/// Parse `UnicodeData.txt` into `(code_point, packed_entry)` pairs.
///
/// Each packed entry is a `u64` laid out as:
///   bits 0..8   canonical combining class,
///   bits 8..29  first canonical decomposition code point (21 bits),
///   bits 29..50 second canonical decomposition code point.
/// Compatibility decompositions (mapping field prefixed with a `<tag>`)
/// are skipped, since only canonical (NFD) data is needed.
fn parse(data: &str) -> Vec<(u32, u64)> {
    data.lines()
        .map(|line| {
            let mut fields = line.split(';');

            // Field 0: the code point, in hex.
            let code_point = fields
                .next()
                .map(|s| u32::from_str_radix(s, 16).expect("valid u32"))
                .expect("code value");

            // Field 3: canonical combining class (fields 1-2 are skipped).
            let combining_class = fields
                .nth(2)
                .map(|s| s.parse::<u8>().expect("valid u8"))
                .expect("canonical combining classes");

            let mut entry = u64::from(combining_class);

            // Field 5: decomposition mapping (field 4 is skipped). A
            // leading '<' marks a compatibility mapping, ignored for NFD.
            let mapping_field = fields.nth(1).unwrap();

            if !mapping_field.starts_with('<') {
                let mappings: Vec<u32> = mapping_field
                    .split(' ')
                    .filter(|s| !s.is_empty())
                    .map(|s| u32::from_str_radix(s, 16).expect("valid u32"))
                    .collect();

                // Canonical decompositions are at most two code points.
                assert!(mappings.len() <= 2);

                for (i, mapping) in mappings.into_iter().enumerate() {
                    entry |= u64::from(mapping) << (21 * i + 8);
                }
            }

            (code_point, entry)
        })
        .collect()
}
|
||||
9953
crates/u-norm/data/DerivedNormalizationProps.txt
Normal file
9953
crates/u-norm/data/DerivedNormalizationProps.txt
Normal file
File diff suppressed because it is too large
Load Diff
19047
crates/u-norm/data/NormalizationTest.txt
Normal file
19047
crates/u-norm/data/NormalizationTest.txt
Normal file
File diff suppressed because it is too large
Load Diff
1276
crates/u-norm/data/StandardizedVariants.txt
Normal file
1276
crates/u-norm/data/StandardizedVariants.txt
Normal file
File diff suppressed because it is too large
Load Diff
34626
crates/u-norm/data/UnicodeData.txt
Normal file
34626
crates/u-norm/data/UnicodeData.txt
Normal file
File diff suppressed because it is too large
Load Diff
305
crates/u-norm/src/lib.rs
Normal file
305
crates/u-norm/src/lib.rs
Normal file
@@ -0,0 +1,305 @@
|
||||
use std::iter::Fuse;
|
||||
use std::ops::Range;
|
||||
use std::str::Chars;
|
||||
|
||||
use tinyvec::TinyVec;
|
||||
|
||||
pub mod table;
|
||||
|
||||
pub fn nfd(s: &str) -> Decompositions<Chars<'_>> {
|
||||
Decompositions {
|
||||
iter: s.chars().fuse(),
|
||||
buffer: Buffer::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Sliding buffer used to put combining marks into canonical order.
///
/// `buffer` holds `(combining_class, char)` pairs. `ready` is the prefix
/// whose canonical order is final and may be emitted; entries past
/// `ready.end` are still pending — later combining marks may need to be
/// sorted in with them before their order is fixed.
struct Buffer {
    // Inline storage for up to 4 pairs; spills to the heap (tinyvec
    // "alloc" feature) for longer runs of combining marks.
    buffer: TinyVec<[(u8, char); 4]>,
    ready: Range<usize>,
}
|
||||
|
||||
impl Buffer {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
buffer: TinyVec::new(),
|
||||
ready: 0..0,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn push_back(&mut self, ch: char) {
|
||||
let class = table::lookup(ch).combining_class();
|
||||
|
||||
if class == 0 {
|
||||
self.sort_pending();
|
||||
|
||||
self.buffer.push((class, ch));
|
||||
self.ready.end = self.buffer.len();
|
||||
} else {
|
||||
self.buffer.push((class, ch));
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn sort_pending(&mut self) {
|
||||
self.buffer[self.ready.end..].sort_by_key(|k| k.0);
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn reset(&mut self) {
|
||||
let pending = self.buffer.len() - self.ready.end;
|
||||
|
||||
for i in 0..pending {
|
||||
self.buffer[i] = self.buffer[i + self.ready.end];
|
||||
}
|
||||
|
||||
self.buffer.truncate(pending);
|
||||
self.ready = 0..0;
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn increment_next_ready(&mut self) {
|
||||
let next = self.ready.start + 1;
|
||||
|
||||
if next == self.ready.end {
|
||||
self.reset();
|
||||
} else {
|
||||
self.ready.start = next;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Iterator adaptor yielding the canonical decomposition (NFD) of the
/// characters produced by `I`.
pub struct Decompositions<I> {
    // Fused so that source exhaustion is handled exactly once in `next`.
    iter: Fuse<I>,
    buffer: Buffer,
}
|
||||
|
||||
impl<I: Iterator<Item = char>> Iterator for Decompositions<I> {
|
||||
type Item = char;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
while self.buffer.ready.end == 0 {
|
||||
match self.iter.next() {
|
||||
Some(ch) => {
|
||||
decompose(ch, &mut self.buffer);
|
||||
}
|
||||
None => {
|
||||
if self.buffer.buffer.is_empty() {
|
||||
return None;
|
||||
} else {
|
||||
self.buffer.sort_pending();
|
||||
self.buffer.ready.end = self.buffer.buffer.len();
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let (_, ch) = self.buffer.buffer[self.buffer.ready.start];
|
||||
|
||||
self.buffer.increment_next_ready();
|
||||
|
||||
Some(ch)
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> (usize, Option<usize>) {
|
||||
let (lower, _) = self.iter.size_hint();
|
||||
|
||||
(lower, None)
|
||||
}
|
||||
}
|
||||
|
||||
// Constants from Unicode 9.0.0 Section 3.12 Conjoining Jamo Behavior
// http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf#M9.32468.Heading.310.Combining.Jamo.Behavior
// First precomposed Hangul syllable (U+AC00).
const S_BASE: u32 = 0xAC00;
// First leading-consonant (choseong) jamo.
const L_BASE: u32 = 0x1100;
// First vowel (jungseong) jamo.
const V_BASE: u32 = 0x1161;
// One before the first trailing-consonant (jongseong) jamo; a trailing
// index of 0 means "no trailing consonant".
const T_BASE: u32 = 0x11A7;
const L_COUNT: u32 = 19;
const V_COUNT: u32 = 21;
const T_COUNT: u32 = 28;
// Syllables per leading consonant (V_COUNT * T_COUNT = 588).
const N_COUNT: u32 = V_COUNT * T_COUNT;
// Total precomposed syllables (11172).
const S_COUNT: u32 = L_COUNT * N_COUNT;
|
||||
|
||||
fn decompose(c: char, buffer: &mut Buffer) {
|
||||
// 7-bit ASCII never decomposes
|
||||
if c <= '\x7f' {
|
||||
buffer.push_back(c);
|
||||
return;
|
||||
}
|
||||
|
||||
// Perform decomposition for Hangul
|
||||
if is_hangul_syllable(c) {
|
||||
decompose_hangul(c, buffer);
|
||||
return;
|
||||
}
|
||||
|
||||
if let Some(decomposed) = table::lookup(c).decomposition() {
|
||||
for d in decomposed {
|
||||
decompose(d, buffer);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// Finally bottom out.
|
||||
buffer.push_back(c);
|
||||
}
|
||||
|
||||
fn is_hangul_syllable(c: char) -> bool {
|
||||
(c as u32) >= S_BASE && (c as u32) < (S_BASE + S_COUNT)
|
||||
}
|
||||
|
||||
#[allow(unsafe_code)]
|
||||
fn decompose_hangul(s: char, buffer: &mut Buffer) {
|
||||
let s_index = s as u32 - S_BASE;
|
||||
let l_index = s_index / N_COUNT;
|
||||
|
||||
unsafe {
|
||||
buffer.push_back(char::from_u32_unchecked(L_BASE + l_index));
|
||||
|
||||
let v_index = (s_index % N_COUNT) / T_COUNT;
|
||||
buffer.push_back(char::from_u32_unchecked(V_BASE + v_index));
|
||||
|
||||
let t_index = s_index % T_COUNT;
|
||||
if t_index > 0 {
|
||||
buffer.push_back(char::from_u32_unchecked(T_BASE + t_index));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use std::fs::File;
    use std::io::{BufRead, BufReader};

    use super::*;

    /// One parsed row of NormalizationTest.txt: the source string, its four
    /// normalization forms, and the trailing comment.
    #[derive(Default)]
    struct Entry {
        source: String,
        nfc: String,
        nfd: String,
        nfkc: String,
        nfkd: String,
        comment: String,
    }

    /// Decode one test-file column: space-separated hex code points.
    ///
    /// Extracted because all five code-point columns use the same format
    /// (previously five identical inline closures).
    fn parse_chars(field: &str) -> String {
        field
            .split(' ')
            .map(|v| u32::from_str_radix(v, 16).expect("valid u32 value as hex"))
            .map(|v| char::from_u32(v).expect("valid char"))
            .collect()
    }

    /// Run the NFD conformance assertions from the official Unicode
    /// NormalizationTest.txt data file.
    #[test]
    fn test_unicode_normalization() {
        let data = File::open("data/NormalizationTest.txt")
            .map(BufReader::new)
            .expect("unicode normalization test file");

        for (i, line) in data.lines().enumerate() {
            let line = line.expect("line");

            // Skip blank lines, '#' comments, and '@PartN' headers.
            if line.is_empty() || line.starts_with(['#', '@']) {
                continue;
            }

            // Columns: source;NFC;NFD;NFKC;NFKD;# comment
            let entry = line
                .splitn(6, ';')
                .enumerate()
                .fold(Entry::default(), |mut entry, (i, string)| {
                    match i {
                        0 => entry.source = parse_chars(string),
                        1 => entry.nfc = parse_chars(string),
                        2 => entry.nfd = parse_chars(string),
                        3 => entry.nfkc = parse_chars(string),
                        4 => entry.nfkd = parse_chars(string),
                        5 => {
                            entry.comment =
                                string.trim_start_matches(['#', ' ']).trim_end().to_string()
                        }
                        _ => unreachable!(),
                    }

                    entry
                });

            // c3 == toNFD(c1) == toNFD(c2) == toNFD(c3)
            similar_asserts::assert_str_eq!(
                nfd(&entry.source).collect::<String>(),
                entry.nfd,
                "c3 == toNFD(c1) at line {} # {}",
                i + 1,
                entry.comment
            );
            similar_asserts::assert_str_eq!(
                nfd(&entry.nfc).collect::<String>(),
                entry.nfd,
                "c3 == toNFD(c2) at line {} # {}",
                i + 1,
                entry.comment
            );
            similar_asserts::assert_str_eq!(
                nfd(&entry.nfd).collect::<String>(),
                entry.nfd,
                "c3 == toNFD(c3) at line {} # {}",
                i + 1,
                entry.comment
            );

            // c5 == toNFD(c4) == toNFD(c5)
            similar_asserts::assert_str_eq!(
                nfd(&entry.nfkc).collect::<String>(),
                entry.nfkd,
                "c5 == toNFD(c4) at line {} # {}",
                i + 1,
                entry.comment
            );
            similar_asserts::assert_str_eq!(
                nfd(&entry.nfkd).collect::<String>(),
                entry.nfkd,
                "c5 == toNFD(c5) at line {} # {}",
                i + 1,
                entry.comment
            );
        }
    }
}
|
||||
14
crates/u-norm/src/main.rs
Normal file
14
crates/u-norm/src/main.rs
Normal file
@@ -0,0 +1,14 @@
|
||||
use u_norm::table;
|
||||
|
||||
fn main() {
|
||||
for c in '\x00'..='\x7f' {
|
||||
let d = table::lookup(c);
|
||||
|
||||
println!(
|
||||
"{:?} class: {}, decomp: {:?}",
|
||||
c,
|
||||
d.combining_class(),
|
||||
d.decomposition().map(|d| d.collect::<Vec<_>>())
|
||||
);
|
||||
}
|
||||
}
|
||||
118
crates/u-norm/src/table.rs
Normal file
118
crates/u-norm/src/table.rs
Normal file
@@ -0,0 +1,118 @@
|
||||
use u_fst::raw::Fst;
|
||||
|
||||
// The (code point -> packed entry) map built by build.rs from
// UnicodeData.txt and embedded at compile time. `new_unchecked` skips FST
// verification; the bytes are trusted build output from this crate.
const TABLE: Fst<&'static [u8]> =
    Fst::new_unchecked(include_bytes!(concat!(env!("OUT_DIR"), "/table.fst")));
|
||||
|
||||
#[inline(always)]
|
||||
pub fn lookup(ch: char) -> Entry {
|
||||
Entry::new(
|
||||
TABLE
|
||||
.get((ch as u32).to_ne_bytes())
|
||||
.map(|output| output.value())
|
||||
.unwrap_or(0),
|
||||
)
|
||||
}
|
||||
|
||||
/// Iterator over the (at most two) code points of a canonical
/// decomposition, packed low-end-first as 21-bit fields in a `u64`.
/// A zero field terminates iteration.
#[derive(Clone, Copy, PartialEq, Debug)]
pub struct Decomposition(u64);

impl Iterator for Decomposition {
    type Item = char;

    #[inline(always)]
    fn next(&mut self) -> Option<Self::Item> {
        // The low 21 bits hold the next code point; zero means exhausted.
        let code_point = (self.0 & 0x1FFFFF) as u32;

        if code_point == 0 {
            return None;
        }

        self.0 >>= 21;

        // SAFETY: assumes the packed fields hold valid scalar values —
        // build.rs writes code points straight from UnicodeData.txt.
        Some(unsafe { char::from_u32_unchecked(code_point) })
    }
}

/// A packed table entry: canonical combining class in the low 8 bits,
/// followed by up to two 21-bit canonical decomposition code points.
#[derive(Clone, Copy, PartialEq, Debug)]
pub struct Entry(u64);

impl Entry {
    /// Wrap a raw packed value (produced by build.rs / stored in the FST).
    pub(crate) fn new(data: u64) -> Self {
        Self(data)
    }

    /// The character's canonical combining class (0 for starters).
    #[inline(always)]
    pub fn combining_class(&self) -> u8 {
        (self.0 & 0xFF) as u8
    }

    /// The canonical decomposition, or `None` when the character does not
    /// decompose (all mapping bits are zero).
    #[inline(always)]
    pub fn decomposition(&self) -> Option<Decomposition> {
        let mappings = self.0 >> 8;

        if mappings == 0 {
            None
        } else {
            Some(Decomposition(mappings))
        }
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use proptest::prelude::*;

    use super::*;

    /// Strategy producing a packed entry plus the fields used to build it.
    ///
    /// NUL is filtered out of the decomposition chars because a zero field
    /// terminates `Decomposition` iteration.
    fn entry_strategy() -> impl Strategy<Value = (u64, (u8, u8, char, char))> {
        (
            any::<u8>(),
            // BUG FIX: was `0u8..2`, which only generates mapping counts
            // 0 and 1 — the two-mapping case (and `decomposition_second`)
            // was never exercised even though the test matches on `2`.
            (0u8..3),
            any::<char>().prop_filter("", |c| *c != '\u{0}'),
            any::<char>().prop_filter("", |c| *c != '\u{0}'),
        )
            .prop_map(
                |(combining_class, mapping_count, decomposition_first, decomposition_second)| {
                    let mut entry = combining_class as u64;

                    if mapping_count > 0 {
                        entry |= (decomposition_first as u64) << 8;
                    }

                    if mapping_count > 1 {
                        entry |= (decomposition_second as u64) << (21 + 8);
                    }

                    (
                        entry,
                        (
                            combining_class,
                            mapping_count,
                            decomposition_first,
                            decomposition_second,
                        ),
                    )
                },
            )
    }

    proptest! {
        /// Packing fields into an entry and reading them back must agree.
        #[test]
        fn proptest_entry_serialize_and_deserialize(a in entry_strategy()) {
            let (data, (combining_class, mapping_count, decomposition_first, decomposition_second)) = a;

            let b = Entry::new(data);

            prop_assert_eq!(b.combining_class(), combining_class, "data = {:064b}", data);

            let c = b.decomposition().map(|i| i.collect::<Vec<_>>());

            // The decoded code points must match exactly what was packed.
            match mapping_count {
                0 => prop_assert_eq!(c, None, "data = {:064b}", data),
                1 => prop_assert_eq!(c, Some(vec![decomposition_first]), "data = {:064b}", data),
                2 => prop_assert_eq!(c, Some(vec![decomposition_first, decomposition_second]), "data = {:064b}", data),
                _ => unreachable!(),
            }
        }
    }
}
|
||||
Reference in New Issue
Block a user