Rename crates
This commit is contained in:
25
crates/u-norm/Cargo.toml
Normal file
25
crates/u-norm/Cargo.toml
Normal file
@@ -0,0 +1,25 @@
|
||||
[package]
name = "u-norm"
version = "0.1.0"
edition = "2021"

# Disable the default libtest bench harness on the library target so that
# Criterion (configured below) is the only benchmark runner.
[lib]
bench = false

# Criterion benchmark entry point: benches/bench.rs supplies its own main
# via criterion_main!, hence harness = false.
[[bench]]
name = "bench"
harness = false

[dependencies]
fst = "0.4.7"
# "alloc" feature lets TinyVec spill to the heap when the inline capacity
# is exceeded (used by the decomposition buffer in src/lib.rs).
tinyvec = { version = "1.6.0", features = ["alloc"] }
u-fst = { path = "../u-fst" }

# build.rs serializes UnicodeData.txt into an FST at compile time.
[build-dependencies]
u-fst = { path = "../u-fst" }

# unicode-normalization serves as the reference implementation in benches;
# similar-asserts gives readable diffs for the conformance test strings.
[dev-dependencies]
criterion = "0.3.5"
proptest = "1.0.0"
similar-asserts = "1.2.0"
unicode-normalization = "0.1.19"
|
||||
5
crates/u-norm/README.md
Normal file
5
crates/u-norm/README.md
Normal file
@@ -0,0 +1,5 @@
|
||||
# u-norm
|
||||
|
||||
## Todo
|
||||
|
||||
- [x] Change Decomposition to be `struct Decomposition(u64)` that implements `Iterator`
|
||||
29
crates/u-norm/benches/bench.rs
Normal file
29
crates/u-norm/benches/bench.rs
Normal file
@@ -0,0 +1,29 @@
|
||||
use std::fs;
|
||||
|
||||
use criterion::{criterion_group, criterion_main, Criterion};
|
||||
|
||||
use u_norm::nfd;
|
||||
use unicode_normalization::UnicodeNormalization;
|
||||
|
||||
const ASCII: &str = "all types of normalized";
|
||||
|
||||
fn criterion_benchmark(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("ASCII");
|
||||
|
||||
group.bench_function("unf", |b| b.iter(|| nfd(ASCII).count()));
|
||||
group.bench_function("unicode-normalization", |b| b.iter(|| ASCII.nfd().count()));
|
||||
|
||||
group.finish();
|
||||
|
||||
let long = fs::read_to_string("benches/long.txt").unwrap();
|
||||
|
||||
let mut group = c.benchmark_group("Long");
|
||||
|
||||
group.bench_function("unf", |b| b.iter(|| nfd(&long).count()));
|
||||
group.bench_function("unicode-normalization", |b| b.iter(|| long.nfd().count()));
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
criterion_group!(benches, criterion_benchmark);
|
||||
criterion_main!(benches);
|
||||
2366
crates/u-norm/benches/long.txt
Normal file
2366
crates/u-norm/benches/long.txt
Normal file
File diff suppressed because one or more lines are too long
66
crates/u-norm/build.rs
Normal file
66
crates/u-norm/build.rs
Normal file
@@ -0,0 +1,66 @@
|
||||
use std::env;
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
|
||||
use u_fst::raw::Fst;
|
||||
|
||||
fn main() {
|
||||
let data = fs::read_to_string("data/UnicodeData.txt").unwrap();
|
||||
|
||||
let out_dir = env::var_os("OUT_DIR").unwrap();
|
||||
let dest_path = Path::new(&out_dir).join("table.fst");
|
||||
|
||||
let mut entries = parse(&data)
|
||||
.into_iter()
|
||||
.map(|(code_value, entry)| (code_value.to_ne_bytes(), entry))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
entries.sort_unstable_by_key(|(k, _)| *k);
|
||||
|
||||
let data = Fst::from_iter_map(entries).unwrap().into_inner();
|
||||
|
||||
fs::write(&dest_path, data).unwrap();
|
||||
|
||||
println!("cargo:rerun-if-changed=data/UnicodeData.txt");
|
||||
println!("cargo:rerun-if-changed=build.rs");
|
||||
}
|
||||
|
||||
/// Parse `UnicodeData.txt` into `(code_point, packed_entry)` pairs.
///
/// Each packed entry is a `u64` laid out as:
///   bits 0..8   canonical combining class,
///   bits 8..29  first canonical decomposition code point (21 bits),
///   bits 29..50 second canonical decomposition code point.
/// Compatibility decompositions (mapping field prefixed with a `<tag>`)
/// are skipped, since only canonical (NFD) data is needed.
fn parse(data: &str) -> Vec<(u32, u64)> {
    data.lines()
        .map(|line| {
            let mut fields = line.split(';');

            // Field 0: the code point, in hex.
            let code_point = fields
                .next()
                .map(|s| u32::from_str_radix(s, 16).expect("valid u32"))
                .expect("code value");

            // Field 3: canonical combining class (fields 1-2 are skipped).
            let combining_class = fields
                .nth(2)
                .map(|s| s.parse::<u8>().expect("valid u8"))
                .expect("canonical combining classes");

            let mut entry = u64::from(combining_class);

            // Field 5: decomposition mapping (field 4 is skipped). A
            // leading '<' marks a compatibility mapping, ignored for NFD.
            let mapping_field = fields.nth(1).unwrap();

            if !mapping_field.starts_with('<') {
                let mappings: Vec<u32> = mapping_field
                    .split(' ')
                    .filter(|s| !s.is_empty())
                    .map(|s| u32::from_str_radix(s, 16).expect("valid u32"))
                    .collect();

                // Canonical decompositions are at most two code points.
                assert!(mappings.len() <= 2);

                for (i, mapping) in mappings.into_iter().enumerate() {
                    entry |= u64::from(mapping) << (21 * i + 8);
                }
            }

            (code_point, entry)
        })
        .collect()
}
|
||||
9953
crates/u-norm/data/DerivedNormalizationProps.txt
Normal file
9953
crates/u-norm/data/DerivedNormalizationProps.txt
Normal file
File diff suppressed because it is too large
Load Diff
19047
crates/u-norm/data/NormalizationTest.txt
Normal file
19047
crates/u-norm/data/NormalizationTest.txt
Normal file
File diff suppressed because it is too large
Load Diff
1276
crates/u-norm/data/StandardizedVariants.txt
Normal file
1276
crates/u-norm/data/StandardizedVariants.txt
Normal file
File diff suppressed because it is too large
Load Diff
34626
crates/u-norm/data/UnicodeData.txt
Normal file
34626
crates/u-norm/data/UnicodeData.txt
Normal file
File diff suppressed because it is too large
Load Diff
305
crates/u-norm/src/lib.rs
Normal file
305
crates/u-norm/src/lib.rs
Normal file
@@ -0,0 +1,305 @@
|
||||
use std::iter::Fuse;
|
||||
use std::ops::Range;
|
||||
use std::str::Chars;
|
||||
|
||||
use tinyvec::TinyVec;
|
||||
|
||||
pub mod table;
|
||||
|
||||
pub fn nfd(s: &str) -> Decompositions<Chars<'_>> {
|
||||
Decompositions {
|
||||
iter: s.chars().fuse(),
|
||||
buffer: Buffer::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Sliding buffer used to put combining marks into canonical order.
///
/// `buffer` holds `(combining_class, char)` pairs. `ready` is the prefix
/// whose canonical order is final and may be emitted; entries past
/// `ready.end` are still pending — later combining marks may need to be
/// sorted in with them before their order is fixed.
struct Buffer {
    // Inline storage for up to 4 pairs; spills to the heap (tinyvec
    // "alloc" feature) for longer runs of combining marks.
    buffer: TinyVec<[(u8, char); 4]>,
    ready: Range<usize>,
}
|
||||
|
||||
impl Buffer {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
buffer: TinyVec::new(),
|
||||
ready: 0..0,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn push_back(&mut self, ch: char) {
|
||||
let class = table::lookup(ch).combining_class();
|
||||
|
||||
if class == 0 {
|
||||
self.sort_pending();
|
||||
|
||||
self.buffer.push((class, ch));
|
||||
self.ready.end = self.buffer.len();
|
||||
} else {
|
||||
self.buffer.push((class, ch));
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn sort_pending(&mut self) {
|
||||
self.buffer[self.ready.end..].sort_by_key(|k| k.0);
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn reset(&mut self) {
|
||||
let pending = self.buffer.len() - self.ready.end;
|
||||
|
||||
for i in 0..pending {
|
||||
self.buffer[i] = self.buffer[i + self.ready.end];
|
||||
}
|
||||
|
||||
self.buffer.truncate(pending);
|
||||
self.ready = 0..0;
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn increment_next_ready(&mut self) {
|
||||
let next = self.ready.start + 1;
|
||||
|
||||
if next == self.ready.end {
|
||||
self.reset();
|
||||
} else {
|
||||
self.ready.start = next;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Iterator adaptor yielding the canonical decomposition (NFD) of the
/// characters produced by `I`.
pub struct Decompositions<I> {
    // Fused so that source exhaustion is handled exactly once in `next`.
    iter: Fuse<I>,
    buffer: Buffer,
}
|
||||
|
||||
impl<I: Iterator<Item = char>> Iterator for Decompositions<I> {
|
||||
type Item = char;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
while self.buffer.ready.end == 0 {
|
||||
match self.iter.next() {
|
||||
Some(ch) => {
|
||||
decompose(ch, &mut self.buffer);
|
||||
}
|
||||
None => {
|
||||
if self.buffer.buffer.is_empty() {
|
||||
return None;
|
||||
} else {
|
||||
self.buffer.sort_pending();
|
||||
self.buffer.ready.end = self.buffer.buffer.len();
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let (_, ch) = self.buffer.buffer[self.buffer.ready.start];
|
||||
|
||||
self.buffer.increment_next_ready();
|
||||
|
||||
Some(ch)
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> (usize, Option<usize>) {
|
||||
let (lower, _) = self.iter.size_hint();
|
||||
|
||||
(lower, None)
|
||||
}
|
||||
}
|
||||
|
||||
// Constants from Unicode 9.0.0 Section 3.12 Conjoining Jamo Behavior
// http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf#M9.32468.Heading.310.Combining.Jamo.Behavior
// First precomposed Hangul syllable (U+AC00).
const S_BASE: u32 = 0xAC00;
// First leading-consonant (choseong) jamo.
const L_BASE: u32 = 0x1100;
// First vowel (jungseong) jamo.
const V_BASE: u32 = 0x1161;
// One before the first trailing-consonant (jongseong) jamo; a trailing
// index of 0 means "no trailing consonant".
const T_BASE: u32 = 0x11A7;
const L_COUNT: u32 = 19;
const V_COUNT: u32 = 21;
const T_COUNT: u32 = 28;
// Syllables per leading consonant (V_COUNT * T_COUNT = 588).
const N_COUNT: u32 = V_COUNT * T_COUNT;
// Total precomposed syllables (11172).
const S_COUNT: u32 = L_COUNT * N_COUNT;
|
||||
|
||||
fn decompose(c: char, buffer: &mut Buffer) {
|
||||
// 7-bit ASCII never decomposes
|
||||
if c <= '\x7f' {
|
||||
buffer.push_back(c);
|
||||
return;
|
||||
}
|
||||
|
||||
// Perform decomposition for Hangul
|
||||
if is_hangul_syllable(c) {
|
||||
decompose_hangul(c, buffer);
|
||||
return;
|
||||
}
|
||||
|
||||
if let Some(decomposed) = table::lookup(c).decomposition() {
|
||||
for d in decomposed {
|
||||
decompose(d, buffer);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// Finally bottom out.
|
||||
buffer.push_back(c);
|
||||
}
|
||||
|
||||
fn is_hangul_syllable(c: char) -> bool {
|
||||
(c as u32) >= S_BASE && (c as u32) < (S_BASE + S_COUNT)
|
||||
}
|
||||
|
||||
#[allow(unsafe_code)]
|
||||
fn decompose_hangul(s: char, buffer: &mut Buffer) {
|
||||
let s_index = s as u32 - S_BASE;
|
||||
let l_index = s_index / N_COUNT;
|
||||
|
||||
unsafe {
|
||||
buffer.push_back(char::from_u32_unchecked(L_BASE + l_index));
|
||||
|
||||
let v_index = (s_index % N_COUNT) / T_COUNT;
|
||||
buffer.push_back(char::from_u32_unchecked(V_BASE + v_index));
|
||||
|
||||
let t_index = s_index % T_COUNT;
|
||||
if t_index > 0 {
|
||||
buffer.push_back(char::from_u32_unchecked(T_BASE + t_index));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use std::fs::File;
    use std::io::{BufRead, BufReader};

    use super::*;

    /// One parsed row of NormalizationTest.txt: the source string, its four
    /// normalization forms, and the trailing comment.
    #[derive(Default)]
    struct Entry {
        source: String,
        nfc: String,
        nfd: String,
        nfkc: String,
        nfkd: String,
        comment: String,
    }

    /// Decode one test-file column: space-separated hex code points.
    ///
    /// Extracted because all five code-point columns use the same format
    /// (previously five identical inline closures).
    fn parse_chars(field: &str) -> String {
        field
            .split(' ')
            .map(|v| u32::from_str_radix(v, 16).expect("valid u32 value as hex"))
            .map(|v| char::from_u32(v).expect("valid char"))
            .collect()
    }

    /// Run the NFD conformance assertions from the official Unicode
    /// NormalizationTest.txt data file.
    #[test]
    fn test_unicode_normalization() {
        let data = File::open("data/NormalizationTest.txt")
            .map(BufReader::new)
            .expect("unicode normalization test file");

        for (i, line) in data.lines().enumerate() {
            let line = line.expect("line");

            // Skip blank lines, '#' comments, and '@PartN' headers.
            if line.is_empty() || line.starts_with(['#', '@']) {
                continue;
            }

            // Columns: source;NFC;NFD;NFKC;NFKD;# comment
            let entry = line
                .splitn(6, ';')
                .enumerate()
                .fold(Entry::default(), |mut entry, (i, string)| {
                    match i {
                        0 => entry.source = parse_chars(string),
                        1 => entry.nfc = parse_chars(string),
                        2 => entry.nfd = parse_chars(string),
                        3 => entry.nfkc = parse_chars(string),
                        4 => entry.nfkd = parse_chars(string),
                        5 => {
                            entry.comment =
                                string.trim_start_matches(['#', ' ']).trim_end().to_string()
                        }
                        _ => unreachable!(),
                    }

                    entry
                });

            // c3 == toNFD(c1) == toNFD(c2) == toNFD(c3)
            similar_asserts::assert_str_eq!(
                nfd(&entry.source).collect::<String>(),
                entry.nfd,
                "c3 == toNFD(c1) at line {} # {}",
                i + 1,
                entry.comment
            );
            similar_asserts::assert_str_eq!(
                nfd(&entry.nfc).collect::<String>(),
                entry.nfd,
                "c3 == toNFD(c2) at line {} # {}",
                i + 1,
                entry.comment
            );
            similar_asserts::assert_str_eq!(
                nfd(&entry.nfd).collect::<String>(),
                entry.nfd,
                "c3 == toNFD(c3) at line {} # {}",
                i + 1,
                entry.comment
            );

            // c5 == toNFD(c4) == toNFD(c5)
            similar_asserts::assert_str_eq!(
                nfd(&entry.nfkc).collect::<String>(),
                entry.nfkd,
                "c5 == toNFD(c4) at line {} # {}",
                i + 1,
                entry.comment
            );
            similar_asserts::assert_str_eq!(
                nfd(&entry.nfkd).collect::<String>(),
                entry.nfkd,
                "c5 == toNFD(c5) at line {} # {}",
                i + 1,
                entry.comment
            );
        }
    }
}
|
||||
14
crates/u-norm/src/main.rs
Normal file
14
crates/u-norm/src/main.rs
Normal file
@@ -0,0 +1,14 @@
|
||||
use u_norm::table;
|
||||
|
||||
fn main() {
|
||||
for c in '\x00'..='\x7f' {
|
||||
let d = table::lookup(c);
|
||||
|
||||
println!(
|
||||
"{:?} class: {}, decomp: {:?}",
|
||||
c,
|
||||
d.combining_class(),
|
||||
d.decomposition().map(|d| d.collect::<Vec<_>>())
|
||||
);
|
||||
}
|
||||
}
|
||||
118
crates/u-norm/src/table.rs
Normal file
118
crates/u-norm/src/table.rs
Normal file
@@ -0,0 +1,118 @@
|
||||
use u_fst::raw::Fst;
|
||||
|
||||
// The (code point -> packed entry) map built by build.rs from
// UnicodeData.txt and embedded at compile time. `new_unchecked` skips FST
// verification; the bytes are trusted build output from this crate.
const TABLE: Fst<&'static [u8]> =
    Fst::new_unchecked(include_bytes!(concat!(env!("OUT_DIR"), "/table.fst")));
|
||||
|
||||
#[inline(always)]
|
||||
pub fn lookup(ch: char) -> Entry {
|
||||
Entry::new(
|
||||
TABLE
|
||||
.get((ch as u32).to_ne_bytes())
|
||||
.map(|output| output.value())
|
||||
.unwrap_or(0),
|
||||
)
|
||||
}
|
||||
|
||||
/// Iterator over the (at most two) code points of a canonical
/// decomposition, packed low-end-first as 21-bit fields in a `u64`.
/// A zero field terminates iteration.
#[derive(Clone, Copy, PartialEq, Debug)]
pub struct Decomposition(u64);

impl Iterator for Decomposition {
    type Item = char;

    #[inline(always)]
    fn next(&mut self) -> Option<Self::Item> {
        // The low 21 bits hold the next code point; zero means exhausted.
        let code_point = (self.0 & 0x1FFFFF) as u32;

        if code_point == 0 {
            return None;
        }

        self.0 >>= 21;

        // SAFETY: assumes the packed fields hold valid scalar values —
        // build.rs writes code points straight from UnicodeData.txt.
        Some(unsafe { char::from_u32_unchecked(code_point) })
    }
}

/// A packed table entry: canonical combining class in the low 8 bits,
/// followed by up to two 21-bit canonical decomposition code points.
#[derive(Clone, Copy, PartialEq, Debug)]
pub struct Entry(u64);

impl Entry {
    /// Wrap a raw packed value (produced by build.rs / stored in the FST).
    pub(crate) fn new(data: u64) -> Self {
        Self(data)
    }

    /// The character's canonical combining class (0 for starters).
    #[inline(always)]
    pub fn combining_class(&self) -> u8 {
        (self.0 & 0xFF) as u8
    }

    /// The canonical decomposition, or `None` when the character does not
    /// decompose (all mapping bits are zero).
    #[inline(always)]
    pub fn decomposition(&self) -> Option<Decomposition> {
        let mappings = self.0 >> 8;

        if mappings == 0 {
            None
        } else {
            Some(Decomposition(mappings))
        }
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use proptest::prelude::*;

    use super::*;

    /// Strategy producing a packed entry plus the fields used to build it.
    ///
    /// NUL is filtered out of the decomposition chars because a zero field
    /// terminates `Decomposition` iteration.
    fn entry_strategy() -> impl Strategy<Value = (u64, (u8, u8, char, char))> {
        (
            any::<u8>(),
            // BUG FIX: was `0u8..2`, which only generates mapping counts
            // 0 and 1 — the two-mapping case (and `decomposition_second`)
            // was never exercised even though the test matches on `2`.
            (0u8..3),
            any::<char>().prop_filter("", |c| *c != '\u{0}'),
            any::<char>().prop_filter("", |c| *c != '\u{0}'),
        )
            .prop_map(
                |(combining_class, mapping_count, decomposition_first, decomposition_second)| {
                    let mut entry = combining_class as u64;

                    if mapping_count > 0 {
                        entry |= (decomposition_first as u64) << 8;
                    }

                    if mapping_count > 1 {
                        entry |= (decomposition_second as u64) << (21 + 8);
                    }

                    (
                        entry,
                        (
                            combining_class,
                            mapping_count,
                            decomposition_first,
                            decomposition_second,
                        ),
                    )
                },
            )
    }

    proptest! {
        /// Packing fields into an entry and reading them back must agree.
        #[test]
        fn proptest_entry_serialize_and_deserialize(a in entry_strategy()) {
            let (data, (combining_class, mapping_count, decomposition_first, decomposition_second)) = a;

            let b = Entry::new(data);

            prop_assert_eq!(b.combining_class(), combining_class, "data = {:064b}", data);

            let c = b.decomposition().map(|i| i.collect::<Vec<_>>());

            // The decoded code points must match exactly what was packed.
            match mapping_count {
                0 => prop_assert_eq!(c, None, "data = {:064b}", data),
                1 => prop_assert_eq!(c, Some(vec![decomposition_first]), "data = {:064b}", data),
                2 => prop_assert_eq!(c, Some(vec![decomposition_first, decomposition_second]), "data = {:064b}", data),
                _ => unreachable!(),
            }
        }
    }
}
|
||||
Reference in New Issue
Block a user