Initial commit

This commit is contained in:
2022-05-19 23:26:00 +02:00
commit 8a8baffba8
53 changed files with 761345 additions and 0 deletions

16
crates/unf/Cargo.toml Normal file
View File

@@ -0,0 +1,16 @@
[package]
name = "unf"
version = "0.1.0"
edition = "2021"
[dependencies]
fst = "0.4.7"
tinyvec = { version = "1.6.0", features = ["alloc"] }
ufst = { path = "../ufst" }
[build-dependencies]
ufst = { path = "../ufst" }
[dev-dependencies]
proptest = "1.0.0"
similar-asserts = "1.2.0"

66
crates/unf/build.rs Normal file
View File

@@ -0,0 +1,66 @@
use std::env;
use std::fs;
use std::path::Path;
use ufst::raw::Fst;
/// Build-script entry point.
///
/// Reads `data/UnicodeData.txt`, packs every record with [`parse`], and
/// writes the resulting FST bytes to `$OUT_DIR/table.fst`.
///
/// Keys are the *native-endian* bytes of each code point; `table::lookup`
/// encodes its query the same way, so the two must be changed together.
///
/// # Panics
///
/// Panics if the data file cannot be read, `OUT_DIR` is not set, the FST
/// cannot be built, or the output file cannot be written.
fn main() {
    let unicode_data = fs::read_to_string("data/UnicodeData.txt").unwrap();
    let out_dir = env::var_os("OUT_DIR").unwrap();
    let table_path = Path::new(&out_dir).join("table.fst");

    // Key every packed entry by the raw bytes of its code point.
    let mut table = Vec::new();
    for (code_point, packed) in parse(&unicode_data) {
        table.push((code_point.to_ne_bytes(), packed));
    }
    // Order by key bytes before building (presumably a requirement of
    // `Fst::from_iter_map` — confirm against `ufst`).
    table.sort_unstable_by_key(|&(key, _)| key);

    let fst_bytes = Fst::from_iter_map(table).unwrap().into_inner();
    fs::write(&table_path, fst_bytes).unwrap();

    println!("cargo:rerun-if-changed=data/UnicodeData.txt");
    println!("cargo:rerun-if-changed=build.rs");
}
/// Parses the contents of `UnicodeData.txt` into `(code point, packed entry)`
/// pairs, one per non-blank line.
///
/// Each packed entry is laid out as:
///
/// * bits `0..8`   – canonical combining class (field 3),
/// * bits `8..29`  – first code point of the canonical decomposition, or 0,
/// * bits `29..50` – second code point of the canonical decomposition, or 0.
///
/// Decomposition fields starting with `<` (tagged, i.e. compatibility
/// mappings) are ignored, leaving both mapping slots zero.
///
/// Blank lines (for example a stray trailing newline) are skipped instead of
/// aborting the build.
///
/// # Panics
///
/// Panics if a non-blank line is malformed: missing fields, a non-hex code
/// point, a combining class that is not a `u8`, more than two canonical
/// mappings, or a mapping that is not a Unicode scalar value.
fn parse(data: &str) -> Vec<(u32, u64)> {
    data.lines()
        .filter(|line| !line.trim().is_empty())
        .map(parse_line)
        .collect()
}

/// Parses a single semicolon-separated `UnicodeData.txt` record.
fn parse_line(line: &str) -> (u32, u64) {
    let mut fields = line.split(';');

    // Field 0: the code point, in hex.
    let code_point = fields
        .next()
        .map(|s| u32::from_str_radix(s, 16).expect("valid u32"))
        .expect("code value");
    // Field 3: canonical combining class (skips name and general category).
    let combining_class = fields
        .nth(2)
        .map(|s| s.parse::<u8>().expect("valid u8"))
        .expect("canonical combining classes");
    // Field 5: decomposition mapping (skips the bidi class).
    let decomposition_mapping = fields.nth(1).expect("decomposition mapping");

    let mut entry = u64::from(combining_class);
    if !decomposition_mapping.starts_with('<') {
        let mappings = decomposition_mapping
            .split(' ')
            .filter(|s| !s.is_empty())
            .map(|s| u32::from_str_radix(s, 16).expect("valid u32"));
        for (i, mapping) in mappings.enumerate() {
            assert!(i < 2, "at most two canonical mappings fit in an entry");
            // Each slot is 21 bits wide and is later turned back into a
            // `char`, so anything that is not a scalar value would either
            // spill into the neighbouring field or yield an invalid `char`.
            assert!(
                char::from_u32(mapping).is_some(),
                "decomposition mapping {:#X} is not a Unicode scalar value",
                mapping
            );
            entry |= u64::from(mapping) << (21 * i + 8);
        }
    }
    (code_point, entry)
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

309
crates/unf/src/lib.rs Normal file
View File

@@ -0,0 +1,309 @@
use std::iter::Fuse;
use std::ops::Range;
use std::str::Chars;
use tinyvec::TinyVec;
pub mod table;
use table::Decomposition;
/// Returns an iterator over the characters of `s` after decomposition.
///
/// The iterator is lazy: characters of `s` are pulled and decomposed only as
/// the returned [`Decompositions`] is advanced.
pub fn nfd(s: &str) -> Decompositions<Chars<'_>> {
    let iter = s.chars().fuse();
    let buffer = Buffer::new();
    Decompositions { iter, buffer }
}
/// Reordering buffer used while decomposing.
///
/// `buffer` holds `(combining class, char)` pairs. The `ready` range marks the
/// prefix whose order is final and may be handed out; everything from
/// `ready.end` onwards is still pending and may be re-sorted.
struct Buffer {
    buffer: TinyVec<[(u8, char); 4]>,
    ready: Range<usize>,
}

impl Buffer {
    /// Creates an empty buffer with nothing ready.
    fn new() -> Self {
        Self {
            buffer: TinyVec::new(),
            ready: 0..0,
        }
    }

    /// Appends `ch`, tagged with its combining class.
    ///
    /// A character of class 0 first sorts the pending run, then is appended,
    /// and finally marks everything up to and including itself as ready.
    /// Any other character is simply appended to the pending run.
    fn push_back(&mut self, ch: char) {
        let class = table::lookup(ch).combining_class();
        let is_starter = class == 0;
        if is_starter {
            self.sort_pending();
        }
        self.buffer.push((class, ch));
        if is_starter {
            self.ready.end = self.buffer.len();
        }
    }

    /// Stable-sorts the pending (not yet ready) run by combining class.
    fn sort_pending(&mut self) {
        let pending_start = self.ready.end;
        self.buffer[pending_start..].sort_by_key(|&(class, _)| class);
    }

    /// Drops the ready prefix, moving any pending items to the front.
    fn reset(&mut self) {
        let consumed = self.ready.end;
        let remaining = self.buffer.len() - consumed;
        self.buffer.copy_within(consumed.., 0);
        self.buffer.truncate(remaining);
        self.ready = 0..0;
    }

    /// Advances past the ready item just handed out; once the ready range is
    /// exhausted the buffer is compacted with [`Buffer::reset`].
    fn increment_next_ready(&mut self) {
        self.ready.start += 1;
        if self.ready.start == self.ready.end {
            self.reset();
        }
    }
}
/// Iterator adaptor yielding the decomposed form of the characters produced
/// by the wrapped iterator `I`.
pub struct Decompositions<I> {
    iter: Fuse<I>,
    buffer: Buffer,
}

impl<I: Iterator<Item = char>> Iterator for Decompositions<I> {
    type Item = char;

    /// Yields the next decomposed character.
    ///
    /// Input is consumed until the buffer has at least one ready item. When
    /// the input runs dry, whatever is still pending is sorted and flushed.
    fn next(&mut self) -> Option<Self::Item> {
        while self.buffer.ready.end == 0 {
            if let Some(ch) = self.iter.next() {
                decompose(ch, &mut self.buffer);
                continue;
            }
            // Input exhausted: either we are done, or the pending tail
            // becomes ready (which also ends this loop).
            if self.buffer.buffer.is_empty() {
                return None;
            }
            self.buffer.sort_pending();
            self.buffer.ready.end = self.buffer.buffer.len();
        }

        let index = self.buffer.ready.start;
        let (_, ch) = self.buffer.buffer[index];
        self.buffer.increment_next_ready();
        Some(ch)
    }

    /// Reports the wrapped iterator's lower bound and no upper bound.
    fn size_hint(&self) -> (usize, Option<usize>) {
        (self.iter.size_hint().0, None)
    }
}
// Constants from Unicode 9.0.0 Section 3.12 Conjoining Jamo Behavior
// http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf#M9.32468.Heading.310.Combining.Jamo.Behavior
/// First code point of the precomposed Hangul syllable block.
const S_BASE: u32 = 0xAC00;
/// Base code point to which the leading-consonant index is added.
const L_BASE: u32 = 0x1100;
/// Base code point to which the vowel index is added.
const V_BASE: u32 = 0x1161;
/// Base code point to which the trailing-consonant index is added; index 0
/// means "no trailing consonant", so this base itself is never emitted.
const T_BASE: u32 = 0x11A7;
/// Number of leading consonants.
const L_COUNT: u32 = 19;
/// Number of vowels.
const V_COUNT: u32 = 21;
/// Number of trailing-consonant slots, including the "none" slot.
const T_COUNT: u32 = 28;
/// Number of syllables sharing one leading consonant.
const N_COUNT: u32 = V_COUNT * T_COUNT;
/// Total number of precomposed Hangul syllables.
const S_COUNT: u32 = L_COUNT * N_COUNT;
/// Recursively decomposes `c`, pushing the resulting characters onto `buffer`.
///
/// * ASCII characters are pushed unchanged.
/// * Precomposed Hangul syllables are decomposed arithmetically.
/// * Anything else is looked up in the table; a character with a
///   decomposition has each part decomposed in turn, and a character without
///   one is pushed as is.
fn decompose(c: char, buffer: &mut Buffer) {
    if c.is_ascii() {
        // 7-bit ASCII never decomposes.
        buffer.push_back(c);
    } else if is_hangul_syllable(c) {
        decompose_hangul(c, buffer);
    } else {
        match table::lookup(c).decomposition() {
            Some(Decomposition::Single(first)) => decompose(first, buffer),
            Some(Decomposition::Double(first, second)) => {
                decompose(first, buffer);
                decompose(second, buffer);
            }
            // No mapping: the recursion bottoms out here.
            None => buffer.push_back(c),
        }
    }
}
/// Returns `true` when `c` lies in the half-open code point range
/// `S_BASE..S_BASE + S_COUNT`.
fn is_hangul_syllable(c: char) -> bool {
    let code_point = c as u32;
    (S_BASE..S_BASE + S_COUNT).contains(&code_point)
}
/// Splits the Hangul syllable `s` into its leading consonant, vowel and
/// (when present) trailing consonant, pushing each onto `buffer`.
///
/// The caller must ensure `is_hangul_syllable(s)` holds; the index
/// arithmetic below relies on it.
// `unsafe` is needed for the unchecked `u32` -> `char` conversions.
#[allow(unsafe_code)]
fn decompose_hangul(s: char, buffer: &mut Buffer) {
    let s_index = s as u32 - S_BASE;
    let l_index = s_index / N_COUNT;
    let v_index = (s_index % N_COUNT) / T_COUNT;
    let t_index = s_index % T_COUNT;

    // SAFETY: for a Hangul syllable `s_index < S_COUNT`, hence
    // `l_index < L_COUNT` and `v_index < V_COUNT`; both sums stay inside the
    // U+1100..U+1175 range, which contains only valid scalar values.
    let leading = unsafe { char::from_u32_unchecked(L_BASE + l_index) };
    buffer.push_back(leading);
    // SAFETY: see above.
    let vowel = unsafe { char::from_u32_unchecked(V_BASE + v_index) };
    buffer.push_back(vowel);

    if t_index > 0 {
        // SAFETY: `t_index < T_COUNT`, so the sum is at most U+11C2, a valid
        // scalar value.
        let trailing = unsafe { char::from_u32_unchecked(T_BASE + t_index) };
        buffer.push_back(trailing);
    }
}
#[cfg(test)]
mod tests {
    use std::fs::File;
    use std::io::{BufRead, BufReader};

    use super::*;

    /// One data row of `NormalizationTest.txt`: the five code point columns
    /// (c1..c5) followed by the trailing comment.
    #[derive(Default)]
    struct Entry {
        source: String,
        nfc: String,
        nfd: String,
        nfkc: String,
        nfkd: String,
        comment: String,
    }

    /// Turns a space-separated list of hex code points into a `String`.
    fn parse_code_points(field: &str) -> String {
        field
            .split(' ')
            .map(|hex| u32::from_str_radix(hex, 16).expect("valid u32 value as hex"))
            .map(|value| char::from_u32(value).expect("valid char"))
            .collect()
    }

    /// Parses one semicolon-separated data row.
    fn parse_entry(line: &str) -> Entry {
        let mut entry = Entry::default();
        for (column, field) in line.splitn(6, ';').enumerate() {
            match column {
                0 => entry.source = parse_code_points(field),
                1 => entry.nfc = parse_code_points(field),
                2 => entry.nfd = parse_code_points(field),
                3 => entry.nfkc = parse_code_points(field),
                4 => entry.nfkd = parse_code_points(field),
                5 => {
                    entry.comment = field.trim_start_matches(['#', ' ']).trim_end().to_string()
                }
                _ => unreachable!(),
            }
        }
        entry
    }

    /// Runs the NFD invariants from `data/NormalizationTest.txt` against
    /// [`nfd`].
    #[test]
    fn test_unicode_normalization() {
        let data = File::open("data/NormalizationTest.txt")
            .map(BufReader::new)
            .expect("unicode normalization test file");

        for (i, line) in data.lines().enumerate() {
            let line = line.expect("line");
            // Skip blank lines, comments (`#`) and part headers (`@`).
            if line.is_empty() || line.starts_with(['#', '@']) {
                continue;
            }

            let entry = parse_entry(&line);

            // c3 == toNFD(c1) == toNFD(c2) == toNFD(c3)
            similar_asserts::assert_str_eq!(
                nfd(&entry.source).collect::<String>(),
                entry.nfd,
                "c3 == toNFD(c1) at line {} # {}",
                i + 1,
                entry.comment
            );
            similar_asserts::assert_str_eq!(
                nfd(&entry.nfc).collect::<String>(),
                entry.nfd,
                "c3 == toNFD(c2) at line {} # {}",
                i + 1,
                entry.comment
            );
            similar_asserts::assert_str_eq!(
                nfd(&entry.nfd).collect::<String>(),
                entry.nfd,
                "c3 == toNFD(c3) at line {} # {}",
                i + 1,
                entry.comment
            );

            // c5 == toNFD(c4) == toNFD(c5)
            similar_asserts::assert_str_eq!(
                nfd(&entry.nfkc).collect::<String>(),
                entry.nfkd,
                "c5 == toNFD(c4) at line {} # {}",
                i + 1,
                entry.comment
            );
            similar_asserts::assert_str_eq!(
                nfd(&entry.nfkd).collect::<String>(),
                entry.nfkd,
                "c5 == toNFD(c5) at line {} # {}",
                i + 1,
                entry.comment
            );
        }
    }
}

114
crates/unf/src/table.rs Normal file
View File

@@ -0,0 +1,114 @@
use ufst::raw::Fst;
/// Lookup table embedded at compile time from `$OUT_DIR/table.fst`.
///
/// Keys are the native-endian bytes of a code point (see [`lookup`]); the
/// associated value is the packed `u64` decoded by [`Entry`].
///
/// NOTE(review): the bytes are presumably those written by this crate's
/// build script, and `new_unchecked` presumably skips validating them —
/// confirm against `ufst`.
const TABLE: Fst<&'static [u8]> =
Fst::new_unchecked(include_bytes!(concat!(env!("OUT_DIR"), "/table.fst")));
/// Looks up the table entry for `ch`.
///
/// The key is the native-endian byte representation of the code point.
/// Characters missing from the table yield an all-zero entry, i.e. combining
/// class 0 and no decomposition.
pub fn lookup(ch: char) -> Entry {
    let key = (ch as u32).to_ne_bytes();
    let packed = match TABLE.get(key) {
        Some(output) => output.value(),
        None => 0,
    };
    Entry::new(packed)
}
/// Decomposition mapping of a single character: either one or two
/// replacement characters.
#[derive(Clone, Copy, PartialEq, Debug)]
pub enum Decomposition {
    Single(char),
    Double(char, char),
}

/// Packed per-character table entry.
///
/// Bit layout of the wrapped `u64`:
///
/// * bits `0..8`   – combining class,
/// * bits `8..29`  – first decomposition code point, 0 when absent,
/// * bits `29..50` – second decomposition code point, 0 when absent.
#[derive(Clone, Copy, PartialEq, Debug)]
pub struct Entry(u64);

impl Entry {
    /// Mask selecting one 21-bit code point slot.
    const MAPPING_MASK: u64 = 0x1F_FFFF;
    /// Bit offset of the first decomposition slot.
    const FIRST_SHIFT: u32 = 8;
    /// Bit offset of the second decomposition slot.
    const SECOND_SHIFT: u32 = 8 + 21;

    /// Wraps a raw packed value.
    pub(crate) fn new(data: u64) -> Self {
        Self(data)
    }

    /// Returns the combining class stored in the low byte.
    pub fn combining_class(&self) -> u8 {
        (self.0 & 0xFF) as u8
    }

    /// Returns the decomposition stored in this entry, if any.
    ///
    /// An empty first slot means "no decomposition"; the second slot is only
    /// consulted when the first one is occupied.
    ///
    /// # Panics
    ///
    /// Panics if an occupied slot does not hold a Unicode scalar value. A
    /// 21-bit slot can encode surrogates and values above U+10FFFF, so the
    /// conversion is checked rather than trusting the table blindly.
    pub(crate) fn decomposition(&self) -> Option<Decomposition> {
        let first = self.mapping(Self::FIRST_SHIFT)?;
        Some(match self.mapping(Self::SECOND_SHIFT) {
            Some(second) => Decomposition::Double(first, second),
            None => Decomposition::Single(first),
        })
    }

    /// Extracts the 21-bit slot starting at bit `shift`; 0 encodes "empty".
    fn mapping(&self, shift: u32) -> Option<char> {
        let raw = ((self.0 >> shift) & Self::MAPPING_MASK) as u32;
        if raw == 0 {
            return None;
        }
        Some(char::from_u32(raw).expect("table entry holds a valid Unicode scalar value"))
    }
}
#[cfg(test)]
mod tests {
    use proptest::prelude::*;

    use super::*;

    /// Generates a packed entry together with the values it was built from:
    /// `(packed, (combining class, mapping count, first, second))`.
    ///
    /// `mapping count` ranges over 0, 1 and 2 *inclusive* so that the
    /// `Decomposition::Double` case is actually exercised.
    fn entry_strategy() -> impl Strategy<Value = (u64, (u8, u8, char, char))> {
        (
            any::<u8>(),
            (0u8..=2),
            // U+0000 is excluded because a zero slot encodes "no mapping".
            any::<char>().prop_filter("NUL encodes an empty slot", |c| *c != '\u{0}'),
            any::<char>().prop_filter("NUL encodes an empty slot", |c| *c != '\u{0}'),
        )
            .prop_map(
                |(combining_class, mapping_count, decomposition_first, decomposition_second)| {
                    let mut entry = combining_class as u64;
                    if mapping_count > 0 {
                        entry |= (decomposition_first as u64) << 8;
                    }
                    if mapping_count > 1 {
                        entry |= (decomposition_second as u64) << (21 + 8);
                    }
                    (
                        entry,
                        (
                            combining_class,
                            mapping_count,
                            decomposition_first,
                            decomposition_second,
                        ),
                    )
                },
            )
    }

    proptest! {
        /// Packing values by hand and unpacking them through `Entry` must
        /// round-trip for zero, one and two mappings.
        #[test]
        fn proptest_entry_serialize_and_deserialize(a in entry_strategy()) {
            let (data, (combining_class, mapping_count, decomposition_first, decomposition_second)) = a;
            let b = Entry::new(data);
            prop_assert_eq!(b.combining_class(), combining_class, "data = {:064b}", data);
            match mapping_count {
                0 => prop_assert_eq!(b.decomposition(), None, "data = {:064b}", data),
                1 => prop_assert_eq!(b.decomposition(), Some(Decomposition::Single(decomposition_first)), "data = {:064b}", data),
                2 => prop_assert_eq!(b.decomposition(), Some(Decomposition::Double(decomposition_first, decomposition_second)), "data = {:064b}", data),
                _ => unreachable!(),
            }
        }
    }
}