Rename crates

This commit is contained in:
2022-05-24 20:58:27 +02:00
parent 8a8baffba8
commit 9f44196e6c
51 changed files with 2531 additions and 54 deletions

25
crates/u-norm/Cargo.toml Normal file
View File

@@ -0,0 +1,25 @@
[package]
name = "u-norm"
version = "0.1.0"
edition = "2021"
# Disable the default libtest bench harness on the library target so
# `cargo bench` only runs the criterion benchmark below.
[lib]
bench = false
# The criterion benchmark provides its own `main` (see benches/bench.rs),
# so the built-in harness is turned off.
[[bench]]
name = "bench"
harness = false
[dependencies]
# NOTE(review): src code imports `u-fst`; confirm the upstream `fst` dep
# is still needed after the rename.
fst = "0.4.7"
# Inline-capacity vector used for the reorder buffer; "alloc" lets it spill
# to the heap for long combining sequences.
tinyvec = { version = "1.6.0", features = ["alloc"] }
u-fst = { path = "../u-fst" }
[build-dependencies]
# build.rs uses u-fst to construct the FST table embedded in the crate.
u-fst = { path = "../u-fst" }
[dev-dependencies]
criterion = "0.3.5"
proptest = "1.0.0"
similar-asserts = "1.2.0"
unicode-normalization = "0.1.19"

5
crates/u-norm/README.md Normal file
View File

@@ -0,0 +1,5 @@
# u-norm
## Todo
- [ ] Change Decomposition to be `struct Decomposition(u64)` that implements `Iterator`

View File

@@ -0,0 +1,29 @@
use std::fs;
use criterion::{criterion_group, criterion_main, Criterion};
use u_norm::nfd;
use unicode_normalization::UnicodeNormalization;
const ASCII: &str = "all types of normalized";

/// Benchmarks this crate's `nfd` against the `unicode-normalization` crate,
/// first on a short ASCII string, then on a longer text fixture read from
/// `benches/long.txt`.
fn criterion_benchmark(c: &mut Criterion) {
    {
        let mut ascii_group = c.benchmark_group("ASCII");
        ascii_group.bench_function("unf", |b| b.iter(|| nfd(ASCII).count()));
        ascii_group.bench_function("unicode-normalization", |b| b.iter(|| ASCII.nfd().count()));
        ascii_group.finish();
    }

    let text = fs::read_to_string("benches/long.txt").unwrap();
    let mut long_group = c.benchmark_group("Long");
    long_group.bench_function("unf", |b| b.iter(|| nfd(&text).count()));
    long_group.bench_function("unicode-normalization", |b| b.iter(|| text.nfd().count()));
    long_group.finish();
}
criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);

File diff suppressed because one or more lines are too long

66
crates/u-norm/build.rs Normal file
View File

@@ -0,0 +1,66 @@
use std::env;
use std::fs;
use std::path::Path;
use u_fst::raw::Fst;
/// Build script: packs data/UnicodeData.txt into an FST mapping each code
/// point (as native-endian bytes) to its packed normalization entry, and
/// writes it to `$OUT_DIR/table.fst` for `include_bytes!` at compile time.
fn main() {
    let data = fs::read_to_string("data/UnicodeData.txt").unwrap();
    let out_dir = env::var_os("OUT_DIR").unwrap();
    let dest_path = Path::new(&out_dir).join("table.fst");
    let mut entries = parse(&data)
        .into_iter()
        // FST keys are byte strings; native-endian bytes here must match the
        // `to_ne_bytes` used at lookup time.
        .map(|(code_value, entry)| (code_value.to_ne_bytes(), entry))
        .collect::<Vec<_>>();
    // Fst construction requires keys in lexicographic byte order; sorting the
    // [u8; 4] keys satisfies that.
    entries.sort_unstable_by_key(|(k, _)| *k);
    let data = Fst::from_iter_map(entries).unwrap().into_inner();
    fs::write(&dest_path, data).unwrap();
    // Rebuild only when the input table or this script changes.
    println!("cargo:rerun-if-changed=data/UnicodeData.txt");
    println!("cargo:rerun-if-changed=build.rs");
}
/// Parses `UnicodeData.txt` into `(code point, packed entry)` pairs.
///
/// Packed layout per entry: bits 0..8 hold the canonical combining class,
/// bits 8..29 the first canonical mapping and bits 29..50 the second
/// (a zero field means "absent").
fn parse(data: &str) -> Vec<(u32, u64)> {
    data.lines()
        .map(|line| {
            let mut fields = line.split(';');
            // Field 0: the code point, in hex.
            let code_point = fields
                .next()
                .map(|s| u32::from_str_radix(s, 16).expect("valid u32"))
                .expect("code value");
            // Field 3: canonical combining class (fields 1-2 are the name and
            // general category, which are not needed here).
            let combining_class = fields
                .nth(2)
                .map(|s| s.parse::<u8>().expect("valid u8"))
                .expect("canonical combining classes");
            let mut packed = u64::from(combining_class);
            // Field 5: decomposition mapping (field 4 is the bidi class). A
            // leading '<' tag marks a compatibility mapping, which this table
            // does not store.
            let mapping_field = fields.nth(1).unwrap();
            if !mapping_field.starts_with('<') {
                let mappings = mapping_field
                    .split(' ')
                    .filter(|s| !s.is_empty())
                    .map(|s| u32::from_str_radix(s, 16).expect("valid u32"))
                    .collect::<Vec<_>>();
                // Canonical decompositions are at most two characters long.
                assert!(mappings.len() <= 2);
                for (slot, mapping) in mappings.into_iter().enumerate() {
                    packed |= u64::from(mapping) << (21 * slot + 8);
                }
            }
            (code_point, packed)
        })
        .collect()
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

305
crates/u-norm/src/lib.rs Normal file
View File

@@ -0,0 +1,305 @@
use std::iter::Fuse;
use std::ops::Range;
use std::str::Chars;
use tinyvec::TinyVec;
pub mod table;
/// Returns an iterator over the canonical decomposition (NFD) of `s`,
/// with combining marks emitted in canonical order.
pub fn nfd(s: &str) -> Decompositions<Chars<'_>> {
    let iter = s.chars().fuse();
    Decompositions {
        iter,
        buffer: Buffer::new(),
    }
}
/// Reorder buffer implementing the Canonical Ordering step: characters with a
/// non-zero combining class are held back and sorted by class before they
/// become available to the consumer.
struct Buffer {
    // (combining class, char) pairs. Entries inside `ready` are finalized and
    // may be handed out; entries at `ready.end..len` are pending combining
    // marks that may still need sorting. Inline capacity of 4 keeps short
    // combining sequences off the heap.
    buffer: TinyVec<[(u8, char); 4]>,
    // Index range of finalized entries, consumed front to back.
    ready: Range<usize>,
}
impl Buffer {
    fn new() -> Self {
        Self {
            buffer: TinyVec::new(),
            ready: 0..0,
        }
    }
    // Appends one character. A starter (class 0) blocks reordering across it,
    // so its arrival finalizes all pending marks — and the starter itself.
    #[inline(always)]
    fn push_back(&mut self, ch: char) {
        let class = table::lookup(ch).combining_class();
        if class == 0 {
            self.sort_pending();
            self.buffer.push((class, ch));
            self.ready.end = self.buffer.len();
        } else {
            // Non-starter: hold it back until the next starter (or end of
            // input) so it can be canonically ordered first.
            self.buffer.push((class, ch));
        }
    }
    // Sorts the pending tail by combining class. `sort_by_key` is stable, so
    // marks with equal classes keep their source order, as NFD requires.
    #[inline(always)]
    fn sort_pending(&mut self) {
        self.buffer[self.ready.end..].sort_by_key(|k| k.0);
    }
    // Drops the fully consumed ready prefix, shifting any pending entries to
    // the front of the buffer.
    #[inline(always)]
    fn reset(&mut self) {
        let pending = self.buffer.len() - self.ready.end;
        for i in 0..pending {
            self.buffer[i] = self.buffer[i + self.ready.end];
        }
        self.buffer.truncate(pending);
        self.ready = 0..0;
    }
    // Advances past one consumed ready entry, compacting the buffer once the
    // whole ready range has been handed out.
    #[inline(always)]
    fn increment_next_ready(&mut self) {
        let next = self.ready.start + 1;
        if next == self.ready.end {
            self.reset();
        } else {
            self.ready.start = next;
        }
    }
}
/// Iterator adapter yielding the canonical decomposition (NFD) of the chars
/// produced by `I`. Created by [`nfd`].
pub struct Decompositions<I> {
    iter: Fuse<I>,
    buffer: Buffer,
}
impl<I: Iterator<Item = char>> Iterator for Decompositions<I> {
    type Item = char;
    fn next(&mut self) -> Option<Self::Item> {
        // Pull from the source until at least one character is finalized
        // (`ready.end > 0` means the ready range is non-empty).
        while self.buffer.ready.end == 0 {
            match self.iter.next() {
                Some(ch) => {
                    decompose(ch, &mut self.buffer);
                }
                None => {
                    if self.buffer.buffer.is_empty() {
                        // Source exhausted and nothing buffered: done.
                        return None;
                    } else {
                        // Source exhausted with marks still pending: sort
                        // them and flush everything as ready.
                        self.buffer.sort_pending();
                        self.buffer.ready.end = self.buffer.buffer.len();
                        break;
                    }
                }
            }
        }
        let (_, ch) = self.buffer.buffer[self.buffer.ready.start];
        self.buffer.increment_next_ready();
        Some(ch)
    }
    fn size_hint(&self) -> (usize, Option<usize>) {
        // Every input char decomposes to at least one output char, but
        // decomposition can expand, so no upper bound is known.
        let (lower, _) = self.iter.size_hint();
        (lower, None)
    }
}
// Constants from Unicode 9.0.0 Section 3.12 Conjoining Jamo Behavior
// http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf#M9.32468.Heading.310.Combining.Jamo.Behavior
// First precomposed Hangul syllable (U+AC00) and the three Jamo block bases.
const S_BASE: u32 = 0xAC00;
const L_BASE: u32 = 0x1100;
const V_BASE: u32 = 0x1161;
const T_BASE: u32 = 0x11A7;
// Counts of leading consonants (L), vowels (V) and trailing consonants (T).
const L_COUNT: u32 = 19;
const V_COUNT: u32 = 21;
const T_COUNT: u32 = 28;
// Syllables per leading consonant, and the total precomposed syllable count.
const N_COUNT: u32 = V_COUNT * T_COUNT;
const S_COUNT: u32 = L_COUNT * N_COUNT;
/// Pushes the full canonical decomposition of `c` onto `buffer`.
fn decompose(c: char, buffer: &mut Buffer) {
    // 7-bit ASCII never decomposes, so skip all table work.
    if c.is_ascii() {
        buffer.push_back(c);
    } else if is_hangul_syllable(c) {
        // Hangul decomposes algorithmically rather than via the table.
        decompose_hangul(c, buffer);
    } else if let Some(mapping) = table::lookup(c).decomposition() {
        // A mapped character may decompose further, so recurse per mapping.
        for mapped in mapping {
            decompose(mapped, buffer);
        }
    } else {
        // No mapping: the character is its own canonical decomposition.
        buffer.push_back(c);
    }
}
/// True if `c` is a precomposed Hangul syllable (U+AC00 ..= U+D7A3).
fn is_hangul_syllable(c: char) -> bool {
    (S_BASE..S_BASE + S_COUNT).contains(&(c as u32))
}
/// Algorithmically decomposes a precomposed Hangul syllable into its leading
/// consonant (L), vowel (V) and optional trailing consonant (T) Jamo, per
/// Unicode Section 3.12. Assumes `s` is a precomposed syllable — the caller
/// (`decompose`) checks `is_hangul_syllable` first.
#[allow(unsafe_code)]
fn decompose_hangul(s: char, buffer: &mut Buffer) {
    let s_index = s as u32 - S_BASE;
    let l_index = s_index / N_COUNT;
    // SAFETY: for s_index < S_COUNT we get l_index < L_COUNT, v_index <
    // V_COUNT and t_index < T_COUNT, so every computed code point lies in the
    // conjoining Jamo blocks — valid scalar values outside the surrogate
    // range.
    unsafe {
        buffer.push_back(char::from_u32_unchecked(L_BASE + l_index));
        let v_index = (s_index % N_COUNT) / T_COUNT;
        buffer.push_back(char::from_u32_unchecked(V_BASE + v_index));
        let t_index = s_index % T_COUNT;
        // t_index == 0 means the syllable has no trailing consonant.
        if t_index > 0 {
            buffer.push_back(char::from_u32_unchecked(T_BASE + t_index));
        }
    }
}
#[cfg(test)]
mod tests {
    use std::fs::File;
    use std::io::{BufRead, BufReader};

    use super::*;

    /// Parses one column of `NormalizationTest.txt`: a space-separated list
    /// of hex code points, decoded into a `String`. Extracted because all
    /// five character columns previously repeated this closure verbatim.
    fn parse_code_points(field: &str) -> String {
        field
            .split(' ')
            .map(|v| u32::from_str_radix(v, 16).expect("valid u32 value as hex"))
            .map(|v| char::from_u32(v).expect("valid char"))
            .collect()
    }

    /// Runs the official Unicode conformance file, checking the NFD
    /// invariants for every test line.
    #[test]
    fn test_unicode_normalization() {
        let data = File::open("data/NormalizationTest.txt")
            .map(BufReader::new)
            .expect("unicode normalization test file");

        // One parsed line: five character columns plus the trailing comment.
        #[derive(Default)]
        struct Entry {
            source: String,
            nfc: String,
            nfd: String,
            nfkc: String,
            nfkd: String,
            comment: String,
        }

        for (i, line) in data.lines().enumerate() {
            let line = line.expect("line");
            // Skip blank lines, comments ('#') and part headers ('@').
            if line.is_empty() || line.starts_with(['#', '@']) {
                continue;
            }
            // Columns are `c1;c2;c3;c4;c5; # comment`; splitn(6) keeps the
            // comment (with its leading "# ") in the final field.
            let entry = line.splitn(6, ';').enumerate().fold(
                Entry::default(),
                |mut entry, (i, string)| {
                    match i {
                        0 => entry.source = parse_code_points(string),
                        1 => entry.nfc = parse_code_points(string),
                        2 => entry.nfd = parse_code_points(string),
                        3 => entry.nfkc = parse_code_points(string),
                        4 => entry.nfkd = parse_code_points(string),
                        5 => {
                            entry.comment =
                                string.trim_start_matches(['#', ' ']).trim_end().to_string()
                        }
                        _ => unreachable!(),
                    }
                    entry
                },
            );

            // c3 == toNFD(c1) == toNFD(c2) == toNFD(c3)
            similar_asserts::assert_str_eq!(
                nfd(&entry.source).collect::<String>(),
                entry.nfd,
                "c3 == toNFD(c1) at line {} # {}",
                i + 1,
                entry.comment
            );
            similar_asserts::assert_str_eq!(
                nfd(&entry.nfc).collect::<String>(),
                entry.nfd,
                "c3 == toNFD(c2) at line {} # {}",
                i + 1,
                entry.comment
            );
            similar_asserts::assert_str_eq!(
                nfd(&entry.nfd).collect::<String>(),
                entry.nfd,
                "c3 == toNFD(c3) at line {} # {}",
                i + 1,
                entry.comment
            );
            // c5 == toNFD(c4) == toNFD(c5)
            similar_asserts::assert_str_eq!(
                nfd(&entry.nfkc).collect::<String>(),
                entry.nfkd,
                "c5 == toNFD(c4) at line {} # {}",
                i + 1,
                entry.comment
            );
            similar_asserts::assert_str_eq!(
                nfd(&entry.nfkd).collect::<String>(),
                entry.nfkd,
                "c5 == toNFD(c5) at line {} # {}",
                i + 1,
                entry.comment
            );
        }
    }
}

14
crates/u-norm/src/main.rs Normal file
View File

@@ -0,0 +1,14 @@
use u_norm::table;
/// Debug utility: dumps the table entry (combining class and canonical
/// decomposition) for every ASCII code point.
fn main() {
    for ch in '\x00'..='\x7f' {
        let entry = table::lookup(ch);
        let decomposition = entry.decomposition().map(|iter| iter.collect::<Vec<_>>());
        println!(
            "{:?} class: {}, decomp: {:?}",
            ch,
            entry.combining_class(),
            decomposition
        );
    }
}

118
crates/u-norm/src/table.rs Normal file
View File

@@ -0,0 +1,118 @@
use u_fst::raw::Fst;
const TABLE: Fst<&'static [u8]> =
Fst::new_unchecked(include_bytes!(concat!(env!("OUT_DIR"), "/table.fst")));
/// Looks up the packed normalization entry for `ch`. Characters absent from
/// the table map to the all-zero entry (combining class 0, no decomposition).
#[inline(always)]
pub fn lookup(ch: char) -> Entry {
    let key = (ch as u32).to_ne_bytes();
    let packed = match TABLE.get(key) {
        Some(output) => output.value(),
        None => 0,
    };
    Entry::new(packed)
}
/// Iterator over the (at most two) characters of a canonical decomposition.
///
/// Layout: two 21-bit code-point fields, the first at bits 0..21 and the
/// second at bits 21..42. A zero field terminates iteration.
#[derive(Clone, Copy, PartialEq, Debug)]
pub struct Decomposition(u64);

impl Iterator for Decomposition {
    type Item = char;

    #[inline(always)]
    fn next(&mut self) -> Option<Self::Item> {
        let code = (self.0 & 0x1F_FFFF) as u32;
        if code == 0 {
            return None;
        }
        self.0 >>= 21;
        // SAFETY: entries are packed from scalar values parsed out of
        // UnicodeData.txt, so each non-zero 21-bit field is a valid `char`.
        // NOTE(review): this relies on the build script's input — confirm the
        // table can never encode a surrogate.
        Some(unsafe { char::from_u32_unchecked(code) })
    }
}

/// One packed normalization-table entry: bits 0..8 hold the canonical
/// combining class, the remaining bits the packed canonical decomposition.
#[derive(Clone, Copy, PartialEq, Debug)]
pub struct Entry(u64);

impl Entry {
    pub(crate) fn new(data: u64) -> Self {
        Self(data)
    }

    /// The character's canonical combining class (0 for starters).
    #[inline(always)]
    pub fn combining_class(&self) -> u8 {
        self.0 as u8
    }

    /// The canonical decomposition, or `None` when the character decomposes
    /// to itself.
    #[inline(always)]
    pub fn decomposition(&self) -> Option<Decomposition> {
        match self.0 >> 8 {
            0 => None,
            data => Some(Decomposition(data)),
        }
    }
}
#[cfg(test)]
mod tests {
    use proptest::prelude::*;

    use super::*;

    /// Generates a packed entry `u64` together with the fields it was built
    /// from: (combining class, mapping count, first char, second char).
    /// NUL is excluded because a zero 21-bit field terminates
    /// `Decomposition` iteration.
    fn entry_strategy() -> impl Strategy<Value = (u64, (u8, u8, char, char))> {
        (
            any::<u8>(),
            // 0, 1 or 2 mappings. Inclusive: the previous `0u8..2` never
            // produced 2, leaving the two-mapping packing branch and the
            // `2 =>` match arm below unexercised.
            (0u8..=2),
            any::<char>().prop_filter("", |c| *c != '\u{0}'),
            any::<char>().prop_filter("", |c| *c != '\u{0}'),
        )
            .prop_map(
                |(combining_class, mapping_count, decomposition_first, decomposition_second)| {
                    // Pack: class at bits 0..8, mappings at bits 8 and 29 —
                    // mirrors the layout produced by build.rs.
                    let mut entry = combining_class as u64;
                    if mapping_count > 0 {
                        entry |= (decomposition_first as u64) << 8;
                    }
                    if mapping_count > 1 {
                        entry |= (decomposition_second as u64) << (21 + 8);
                    }
                    (
                        entry,
                        (
                            combining_class,
                            mapping_count,
                            decomposition_first,
                            decomposition_second,
                        ),
                    )
                },
            )
    }

    proptest! {
        // Round-trips a randomly packed entry through the `Entry` accessors.
        #[test]
        fn proptest_entry_serialize_and_deserialize(a in entry_strategy()) {
            let (data, (combining_class, mapping_count, decomposition_first, decomposition_second)) = a;
            let b = Entry::new(data);
            prop_assert_eq!(b.combining_class(), combining_class, "data = {:064b}", data);
            let c = b.decomposition().map(|i| i.collect::<Vec<_>>());
            match mapping_count {
                0 => prop_assert_eq!(c, None, "data = {:064b}", data),
                1 => prop_assert_eq!(c, Some(vec![decomposition_first]), "data = {:064b}", data),
                2 => prop_assert_eq!(c, Some(vec![decomposition_first, decomposition_second]), "data = {:064b}", data),
                _ => unreachable!(),
            }
        }
    }
}