Initial commit
This commit is contained in:
16
crates/unf/Cargo.toml
Normal file
16
crates/unf/Cargo.toml
Normal file
@@ -0,0 +1,16 @@
|
||||
[package]
name = "unf"
version = "0.1.0"
edition = "2021"

[dependencies]
# Finite-state-transducer maps; presumably required by `ufst`'s public API
# surface — TODO confirm it is referenced directly by this crate's sources.
fst = "0.4.7"
# Small-vector used for the decomposition reorder buffer (inline up to 4
# (class, char) pairs before spilling to the heap).
tinyvec = { version = "1.6.0", features = ["alloc"] }
# Project-local FST wrapper; provides `ufst::raw::Fst` used in src/table.rs.
ufst = { path = "../ufst" }

[build-dependencies]
# build.rs also constructs the FST table, so it needs the same wrapper.
ufst = { path = "../ufst" }

[dev-dependencies]
# Property-based round-trip tests for the packed Entry layout (src/table.rs).
proptest = "1.0.0"
# Diff-style assertion output for the Unicode normalization conformance tests.
similar-asserts = "1.2.0"
|
||||
66
crates/unf/build.rs
Normal file
66
crates/unf/build.rs
Normal file
@@ -0,0 +1,66 @@
|
||||
use std::env;
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
|
||||
use ufst::raw::Fst;
|
||||
|
||||
/// Build script: converts `data/UnicodeData.txt` into a serialized FST map
/// written to `$OUT_DIR/table.fst`, which `src/table.rs` embeds with
/// `include_bytes!`.
fn main() {
    let data = fs::read_to_string("data/UnicodeData.txt").unwrap();

    let out_dir = env::var_os("OUT_DIR").unwrap();
    let dest_path = Path::new(&out_dir).join("table.fst");

    // Key each entry by the code point's native-endian bytes.
    // NOTE(review): native-endian keys are only consistent because the table
    // is generated and consumed on the same host, and the lookup side
    // (src/table.rs) also uses `to_ne_bytes` — confirm before ever shipping
    // the generated artifact cross-platform.
    let mut entries = parse(&data)
        .into_iter()
        .map(|(code_value, entry)| (code_value.to_ne_bytes(), entry))
        .collect::<Vec<_>>();

    // FST construction requires keys in ascending byte order.
    entries.sort_unstable_by_key(|(k, _)| *k);

    let data = Fst::from_iter_map(entries).unwrap().into_inner();

    fs::write(&dest_path, data).unwrap();

    // Rebuild the table only when the input data or this script changes.
    println!("cargo:rerun-if-changed=data/UnicodeData.txt");
    println!("cargo:rerun-if-changed=build.rs");
}
|
||||
|
||||
/// Parses `UnicodeData.txt` into `(code point, packed entry)` pairs.
///
/// Packed `u64` layout (decoded by `Entry` in src/table.rs):
///   bits 0..8   — canonical combining class
///   bits 8..29  — first canonical decomposition mapping (21 bits, 0 = none)
///   bits 29..50 — second canonical decomposition mapping (21 bits, 0 = none)
///
/// Compatibility decompositions (field values starting with a `<tag>`) are
/// deliberately ignored.
fn parse(data: &str) -> Vec<(u32, u64)> {
    data.lines()
        .map(|line| {
            let mut fields = line.split(';');

            // Field 0: the code point, written in hex.
            let code_point = fields
                .next()
                .map(|s| u32::from_str_radix(s, 16).expect("valid u32"))
                .expect("code value");

            // Field 3: canonical combining class (fields 1-2 are skipped).
            let combining_class = fields
                .nth(2)
                .map(|s| s.parse::<u8>().expect("valid u8"))
                .expect("canonical combining classes");

            let mut entry = u64::from(combining_class);

            // Field 5: decomposition mapping (field 4 is skipped). A leading
            // '<' marks a compatibility decomposition, which this table omits.
            let decomposition_mapping = fields.nth(1).unwrap();

            if !decomposition_mapping.starts_with('<') {
                let mappings: Vec<u32> = decomposition_mapping
                    .split(' ')
                    .filter(|s| !s.is_empty())
                    .map(|s| u32::from_str_radix(s, 16).expect("valid u32"))
                    .collect();

                // Canonical decompositions expand to at most two code points.
                assert!(mappings.len() <= 2);

                for (i, mapping) in mappings.into_iter().enumerate() {
                    entry |= u64::from(mapping) << (21 * i + 8);
                }
            }

            (code_point, entry)
        })
        .collect()
}
|
||||
9953
crates/unf/data/DerivedNormalizationProps.txt
Normal file
9953
crates/unf/data/DerivedNormalizationProps.txt
Normal file
File diff suppressed because it is too large
Load Diff
19047
crates/unf/data/NormalizationTest.txt
Normal file
19047
crates/unf/data/NormalizationTest.txt
Normal file
File diff suppressed because it is too large
Load Diff
1276
crates/unf/data/StandardizedVariants.txt
Normal file
1276
crates/unf/data/StandardizedVariants.txt
Normal file
File diff suppressed because it is too large
Load Diff
34626
crates/unf/data/UnicodeData.txt
Normal file
34626
crates/unf/data/UnicodeData.txt
Normal file
File diff suppressed because it is too large
Load Diff
309
crates/unf/src/lib.rs
Normal file
309
crates/unf/src/lib.rs
Normal file
@@ -0,0 +1,309 @@
|
||||
use std::iter::Fuse;
|
||||
use std::ops::Range;
|
||||
use std::str::Chars;
|
||||
|
||||
use tinyvec::TinyVec;
|
||||
|
||||
pub mod table;
|
||||
|
||||
use table::Decomposition;
|
||||
|
||||
pub fn nfd(s: &str) -> Decompositions<Chars<'_>> {
|
||||
Decompositions {
|
||||
iter: s.chars().fuse(),
|
||||
buffer: Buffer::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// A small reorder buffer used while streaming decomposed characters.
///
/// Each element pairs a canonical combining class with a character. The
/// `ready` range indexes the prefix of `buffer` that is already in canonical
/// order and may be emitted; elements at or past `ready.end` are pending
/// combining marks that may still need sorting.
struct Buffer {
    // Inline storage for up to 4 (class, char) pairs before heap spill.
    buffer: TinyVec<[(u8, char); 4]>,
    // The finalized, emit-ready portion of `buffer`.
    ready: Range<usize>,
}

impl Buffer {
    /// Creates an empty buffer with nothing ready to emit.
    fn new() -> Self {
        Self {
            buffer: TinyVec::new(),
            ready: 0..0,
        }
    }

    /// Queues a decomposed character, keyed by its combining class.
    ///
    /// A starter (class 0) terminates the current run of combining marks:
    /// the pending marks are sorted into canonical order, and everything up
    /// to and including the starter becomes ready. Non-starters are simply
    /// appended as pending.
    fn push_back(&mut self, ch: char) {
        let class = table::lookup(ch).combining_class();

        if class == 0 {
            self.sort_pending();

            self.buffer.push((class, ch));
            self.ready.end = self.buffer.len();
        } else {
            self.buffer.push((class, ch));
        }
    }

    /// Sorts the pending (not-yet-ready) tail by combining class.
    /// `sort_by_key` is stable, so marks with equal classes keep their
    /// original relative order (as canonical ordering requires).
    fn sort_pending(&mut self) {
        self.buffer[self.ready.end..].sort_by_key(|k| k.0);
    }

    /// Discards the consumed ready prefix, shifting any pending elements to
    /// the front and leaving nothing ready.
    fn reset(&mut self) {
        let pending = self.buffer.len() - self.ready.end;

        // Move the pending tail down over the consumed prefix in place.
        for i in 0..pending {
            self.buffer[i] = self.buffer[i + self.ready.end];
        }

        self.buffer.truncate(pending);
        self.ready = 0..0;
    }

    /// Consumes one ready element; compacts the buffer once the ready range
    /// is exhausted.
    fn increment_next_ready(&mut self) {
        let next = self.ready.start + 1;

        if next == self.ready.end {
            self.reset();
        } else {
            self.ready.start = next;
        }
    }
}
|
||||
|
||||
/// Streaming NFD iterator returned by [`nfd`].
pub struct Decompositions<I> {
    // Fused so that hitting the end of input repeatedly stays `None`.
    iter: Fuse<I>,
    buffer: Buffer,
}

impl<I: Iterator<Item = char>> Iterator for Decompositions<I> {
    type Item = char;

    fn next(&mut self) -> Option<Self::Item> {
        // Pull source characters until at least one decomposed character is
        // ready (ready.end > 0) or the source is exhausted.
        while self.buffer.ready.end == 0 {
            match self.iter.next() {
                Some(ch) => {
                    decompose(ch, &mut self.buffer);
                }
                None => {
                    if self.buffer.buffer.is_empty() {
                        return None;
                    } else {
                        // End of input: flush trailing combining marks by
                        // sorting them and marking the whole buffer ready.
                        self.buffer.sort_pending();
                        self.buffer.ready.end = self.buffer.buffer.len();

                        break;
                    }
                }
            }
        }

        let (_, ch) = self.buffer.buffer[self.buffer.ready.start];

        self.buffer.increment_next_ready();

        Some(ch)
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        // Every input char yields at least one output char; decomposition
        // can expand, so no finite upper bound is reported.
        let (lower, _) = self.iter.size_hint();

        (lower, None)
    }
}
|
||||
|
||||
// Constants from Unicode 9.0.0 Section 3.12 Conjoining Jamo Behavior
// http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf#M9.32468.Heading.310.Combining.Jamo.Behavior
const S_BASE: u32 = 0xAC00; // first precomposed Hangul syllable
const L_BASE: u32 = 0x1100; // first leading consonant (choseong) Jamo
const V_BASE: u32 = 0x1161; // first vowel (jungseong) Jamo
const T_BASE: u32 = 0x11A7; // one before the first trailing (jongseong) Jamo
const L_COUNT: u32 = 19; // number of leading consonants
const V_COUNT: u32 = 21; // number of vowels
const T_COUNT: u32 = 28; // trailing consonants, incl. the "none" slot
const N_COUNT: u32 = V_COUNT * T_COUNT; // syllables per leading consonant
const S_COUNT: u32 = L_COUNT * N_COUNT; // total precomposed syllables (11172)
|
||||
|
||||
/// Recursively decomposes `c` (canonical decomposition only) and pushes the
/// resulting characters into `buffer`.
fn decompose(c: char, buffer: &mut Buffer) {
    // 7-bit ASCII never decomposes
    if c <= '\x7f' {
        buffer.push_back(c);
        return;
    }

    // Perform decomposition for Hangul
    if is_hangul_syllable(c) {
        decompose_hangul(c, buffer);
        return;
    }

    // Table-driven decomposition. A mapping target may itself decompose
    // further, hence the recursion on each mapped character.
    if let Some(decomposed) = table::lookup(c).decomposition() {
        match decomposed {
            Decomposition::Single(f) => {
                decompose(f, buffer);
            }
            Decomposition::Double(f, s) => {
                decompose(f, buffer);
                decompose(s, buffer);
            }
        }
        return;
    }

    // Finally bottom out.
    buffer.push_back(c);
}
|
||||
|
||||
fn is_hangul_syllable(c: char) -> bool {
|
||||
(c as u32) >= S_BASE && (c as u32) < (S_BASE + S_COUNT)
|
||||
}
|
||||
|
||||
/// Arithmetically decomposes a precomposed Hangul syllable into its leading
/// consonant, vowel, and optional trailing consonant Jamo, pushing each into
/// `buffer`.
///
/// Callers must pass a character for which [`is_hangul_syllable`] is true
/// (as `decompose` does); otherwise the subtraction below underflows.
#[allow(unsafe_code)]
fn decompose_hangul(s: char, buffer: &mut Buffer) {
    let s_index = s as u32 - S_BASE;
    let l_index = s_index / N_COUNT;

    // SAFETY: for `s` within the precomposed syllable range, each computed
    // index is bounded by its block size (l_index < L_COUNT,
    // v_index < V_COUNT, t_index < T_COUNT), so every constructed value
    // lands inside the corresponding Jamo block and is a valid Unicode
    // scalar value.
    unsafe {
        buffer.push_back(char::from_u32_unchecked(L_BASE + l_index));

        let v_index = (s_index % N_COUNT) / T_COUNT;
        buffer.push_back(char::from_u32_unchecked(V_BASE + v_index));

        let t_index = s_index % T_COUNT;
        // t_index == 0 means the syllable has no trailing consonant.
        if t_index > 0 {
            buffer.push_back(char::from_u32_unchecked(T_BASE + t_index));
        }
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use std::fs::File;
    use std::io::{BufRead, BufReader};

    use super::*;

    // Decodes one NormalizationTest.txt column: space-separated hex scalar
    // values, e.g. "0061 0301" -> "a\u{301}". Factored out of the five
    // previously duplicated per-column closures.
    fn parse_column(column: &str) -> String {
        column
            .split(' ')
            .map(|v| u32::from_str_radix(v, 16).expect("valid u32 value as hex"))
            .map(|v| char::from_u32(v).expect("valid char"))
            .collect()
    }

    // Runs the NFD conformance assertions against the Unicode
    // NormalizationTest.txt data file (columns c1..c5 on each test line).
    #[test]
    fn test_unicode_normalization() {
        let data = File::open("data/NormalizationTest.txt")
            .map(BufReader::new)
            .expect("unicode normalization test file");

        for (i, line) in data.lines().enumerate() {
            let line = line.expect("line");

            // Skip blank lines, comments ('#'), and part headers ('@').
            if line.is_empty() || line.starts_with(['#', '@']) {
                continue;
            }

            // Layout: c1;c2;c3;c4;c5; # comment
            let mut columns = line.splitn(6, ';');
            let source = parse_column(columns.next().expect("c1 column"));
            let nfc = parse_column(columns.next().expect("c2 column"));
            let nfd_expected = parse_column(columns.next().expect("c3 column"));
            let nfkc = parse_column(columns.next().expect("c4 column"));
            let nfkd_expected = parse_column(columns.next().expect("c5 column"));
            let comment = columns
                .next()
                .map(|s| s.trim_start_matches(['#', ' ']).trim_end().to_string())
                .unwrap_or_default();

            // c3 == toNFD(c1) == toNFD(c2) == toNFD(c3)
            similar_asserts::assert_str_eq!(
                nfd(&source).collect::<String>(),
                nfd_expected,
                "c3 == toNFD(c1) at line {} # {}",
                i + 1,
                comment
            );
            similar_asserts::assert_str_eq!(
                nfd(&nfc).collect::<String>(),
                nfd_expected,
                "c3 == toNFD(c2) at line {} # {}",
                i + 1,
                comment
            );
            similar_asserts::assert_str_eq!(
                nfd(&nfd_expected).collect::<String>(),
                nfd_expected,
                "c3 == toNFD(c3) at line {} # {}",
                i + 1,
                comment
            );

            // c5 == toNFD(c4) == toNFD(c5)
            similar_asserts::assert_str_eq!(
                nfd(&nfkc).collect::<String>(),
                nfkd_expected,
                "c5 == toNFD(c4) at line {} # {}",
                i + 1,
                comment
            );
            similar_asserts::assert_str_eq!(
                nfd(&nfkd_expected).collect::<String>(),
                nfkd_expected,
                "c5 == toNFD(c5) at line {} # {}",
                i + 1,
                comment
            );
        }
    }
}
|
||||
114
crates/unf/src/table.rs
Normal file
114
crates/unf/src/table.rs
Normal file
@@ -0,0 +1,114 @@
|
||||
use ufst::raw::Fst;
|
||||
|
||||
/// Per-code-point data table, generated by build.rs from
/// `data/UnicodeData.txt` and embedded in the binary. Keys are the
/// native-endian bytes of the `u32` code point; values are the packed
/// [`Entry`] representation.
const TABLE: Fst<&'static [u8]> =
    Fst::new_unchecked(include_bytes!(concat!(env!("OUT_DIR"), "/table.fst")));

/// Looks up the table entry for `ch`.
///
/// Code points absent from the table yield the all-zero entry: combining
/// class 0 and no decomposition.
pub fn lookup(ch: char) -> Entry {
    Entry::new(
        TABLE
            // NOTE(review): must stay consistent with the key encoding in
            // build.rs (`to_ne_bytes` there as well) — confirm if the table
            // is ever generated on a host with different endianness.
            .get((ch as u32).to_ne_bytes())
            .map(|output| output.value())
            .unwrap_or(0),
    )
}
|
||||
|
||||
/// A canonical decomposition mapping of one or two code points.
#[derive(Clone, Copy, PartialEq, Debug)]
pub enum Decomposition {
    Single(char),
    Double(char, char),
}

/// A packed per-code-point table entry.
///
/// Bit layout (written by build.rs):
///   bits 0..8   — canonical combining class
///   bits 8..29  — first decomposition code point (21 bits, 0 = absent)
///   bits 29..50 — second decomposition code point (21 bits, 0 = absent)
#[derive(Clone, Copy, PartialEq, Debug)]
pub struct Entry(u64);

impl Entry {
    /// Wraps a raw packed value read from the table.
    pub(crate) fn new(data: u64) -> Self {
        Self(data)
    }

    /// The character's canonical combining class (0 for starters).
    pub fn combining_class(&self) -> u8 {
        (self.0 & 0xFF) as u8
    }

    /// The canonical decomposition encoded in this entry, if any.
    pub(crate) fn decomposition(&self) -> Option<Decomposition> {
        // Low 21 bits of a field hold one code point; 0 means "absent"
        // (U+0000 never decomposes, so 0 is a safe sentinel).
        const MASK: u64 = 0x1FFFFF;

        let first = ((self.0 >> 8) & MASK) as u32;
        if first == 0 {
            return None;
        }

        let second = ((self.0 >> 29) & MASK) as u32;

        // SAFETY: entries come from the build-time generator, which packs
        // only valid Unicode scalar values into these fields — TODO confirm
        // no other caller constructs entries with arbitrary bits.
        let decomposition = unsafe {
            if second == 0 {
                Decomposition::Single(char::from_u32_unchecked(first))
            } else {
                Decomposition::Double(
                    char::from_u32_unchecked(first),
                    char::from_u32_unchecked(second),
                )
            }
        };

        Some(decomposition)
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use proptest::prelude::*;

    use super::*;

    // Generates a packed entry `u64` together with the components it was
    // built from: (combining class, mapping count, first char, second char).
    // '\u{0}' is excluded because a zero field encodes "no mapping".
    fn entry_strategy() -> impl Strategy<Value = (u64, (u8, u8, char, char))> {
        (
            any::<u8>(),
            // 0, 1, or 2 decomposition mappings. The previous exclusive
            // range `0u8..2` never produced 2, leaving the Double case (and
            // its match arm below) untested.
            (0u8..=2),
            any::<char>().prop_filter("", |c| *c != '\u{0}'),
            any::<char>().prop_filter("", |c| *c != '\u{0}'),
        )
            .prop_map(
                |(combining_class, mapping_count, decomposition_first, decomposition_second)| {
                    let mut entry = combining_class as u64;

                    if mapping_count > 0 {
                        entry |= (decomposition_first as u64) << 8;
                    }

                    if mapping_count > 1 {
                        entry |= (decomposition_second as u64) << (21 + 8);
                    }

                    (
                        entry,
                        (
                            combining_class,
                            mapping_count,
                            decomposition_first,
                            decomposition_second,
                        ),
                    )
                },
            )
    }

    proptest! {
        // Round-trip: decoding a packed entry recovers the fields used to
        // build it.
        #[test]
        fn proptest_entry_serialize_and_deserialize(a in entry_strategy()) {
            let (data, (combining_class, mapping_count, decomposition_first, decomposition_second)) = a;

            let b = Entry::new(data);

            prop_assert_eq!(b.combining_class(), combining_class, "data = {:064b}", data);

            match mapping_count {
                0 => prop_assert_eq!(b.decomposition(), None, "data = {:064b}", data),
                1 => prop_assert_eq!(b.decomposition(), Some(Decomposition::Single(decomposition_first)), "data = {:064b}", data),
                2 => prop_assert_eq!(b.decomposition(), Some(Decomposition::Double(decomposition_first, decomposition_second)), "data = {:064b}", data),
                _ => unreachable!(),
            }
        }
    }
}
|
||||
Reference in New Issue
Block a user