Initial commit
This commit is contained in:
@@ -0,0 +1,16 @@
|
||||
[package]
|
||||
name = "unf"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
fst = "0.4.7"
|
||||
tinyvec = { version = "1.6.0", features = ["alloc"] }
|
||||
ufst = { path = "../ufst" }
|
||||
|
||||
[build-dependencies]
|
||||
ufst = { path = "../ufst" }
|
||||
|
||||
[dev-dependencies]
|
||||
proptest = "1.0.0"
|
||||
similar-asserts = "1.2.0"
|
||||
@@ -0,0 +1,66 @@
|
||||
use std::env;
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
|
||||
use ufst::raw::Fst;
|
||||
|
||||
fn main() {
|
||||
let data = fs::read_to_string("data/UnicodeData.txt").unwrap();
|
||||
|
||||
let out_dir = env::var_os("OUT_DIR").unwrap();
|
||||
let dest_path = Path::new(&out_dir).join("table.fst");
|
||||
|
||||
let mut entries = parse(&data)
|
||||
.into_iter()
|
||||
.map(|(code_value, entry)| (code_value.to_ne_bytes(), entry))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
entries.sort_unstable_by_key(|(k, _)| *k);
|
||||
|
||||
let data = Fst::from_iter_map(entries).unwrap().into_inner();
|
||||
|
||||
fs::write(&dest_path, data).unwrap();
|
||||
|
||||
println!("cargo:rerun-if-changed=data/UnicodeData.txt");
|
||||
println!("cargo:rerun-if-changed=build.rs");
|
||||
}
|
||||
|
||||
/// Parses the text of `UnicodeData.txt` into `(code point, packed entry)` pairs.
///
/// Each non-blank line is a `;`-separated record. Field 0 is the code point in
/// hex, field 3 the canonical combining class in decimal and field 5 the
/// decomposition mapping. The packed entry is laid out as:
///
/// * bits `0..8`   – canonical combining class
/// * bits `8..29`  – first code point of the decomposition (0 when absent)
/// * bits `29..50` – second code point of the decomposition (0 when absent)
///
/// Mappings that start with `<` (tagged mappings such as `<noBreak>`) are
/// ignored, so only untagged mappings are recorded.
///
/// # Panics
///
/// Panics when a record is malformed: missing fields, non-numeric values, more
/// than two mapping code points, or a mapping outside `1..=0x10FFFF`.
fn parse(data: &str) -> Vec<(u32, u64)> {
    // Width of the combining-class field at the bottom of the packed entry.
    const CLASS_BITS: usize = 8;
    // Width reserved for each decomposition code point.
    const MAPPING_BITS: usize = 21;
    // Largest Unicode scalar value; anything above does not fit the 21-bit slot
    // as a valid character.
    const MAX_SCALAR: u32 = 0x10FFFF;

    let mut entries = Vec::new();

    // Blank lines carry no record; skipping them avoids a misleading
    // "valid u32" panic on files with stray empty lines.
    for line in data.lines().filter(|line| !line.trim().is_empty()) {
        let mut fields = line.split(';');

        let code_point = fields
            .next()
            .map(|s| u32::from_str_radix(s, 16).expect("valid u32"))
            .expect("code value");

        // Skips fields 1 and 2 (name, general category) and takes field 3.
        let combining_class = fields
            .nth(2)
            .map(|s| s.parse::<u8>().expect("valid u8"))
            .expect("canonical combining classes");

        let mut entry = u64::from(combining_class);

        // Skips field 4 (bidi class) and takes field 5.
        let decomposition_mapping = fields.nth(1).expect("decomposition mapping");

        if !decomposition_mapping.starts_with('<') {
            let mappings = decomposition_mapping
                .split(' ')
                .filter(|s| !s.is_empty())
                .map(|s| u32::from_str_radix(s, 16).expect("valid u32"))
                .collect::<Vec<_>>();

            assert!(
                mappings.len() <= 2,
                "U+{:04X}: expected at most two mapping code points, found {}",
                code_point,
                mappings.len()
            );

            for (i, mapping) in mappings.into_iter().enumerate() {
                // Zero is the "no mapping" marker and larger values would spill
                // into the neighbouring bit field, so reject both up front.
                assert!(
                    (1..=MAX_SCALAR).contains(&mapping),
                    "U+{:04X}: mapping {:#X} is not encodable",
                    code_point,
                    mapping
                );

                entry |= u64::from(mapping) << (MAPPING_BITS * i + CLASS_BITS);
            }
        }

        entries.push((code_point, entry));
    }

    entries
}
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,309 @@
|
||||
use std::iter::Fuse;
|
||||
use std::ops::Range;
|
||||
use std::str::Chars;
|
||||
|
||||
use tinyvec::TinyVec;
|
||||
|
||||
pub mod table;
|
||||
|
||||
use table::Decomposition;
|
||||
|
||||
pub fn nfd(s: &str) -> Decompositions<Chars<'_>> {
|
||||
Decompositions {
|
||||
iter: s.chars().fuse(),
|
||||
buffer: Buffer::new(),
|
||||
}
|
||||
}
|
||||
|
||||
struct Buffer {
|
||||
buffer: TinyVec<[(u8, char); 4]>,
|
||||
ready: Range<usize>,
|
||||
}
|
||||
|
||||
impl Buffer {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
buffer: TinyVec::new(),
|
||||
ready: 0..0,
|
||||
}
|
||||
}
|
||||
|
||||
fn push_back(&mut self, ch: char) {
|
||||
let class = table::lookup(ch).combining_class();
|
||||
|
||||
if class == 0 {
|
||||
self.sort_pending();
|
||||
|
||||
self.buffer.push((class, ch));
|
||||
self.ready.end = self.buffer.len();
|
||||
} else {
|
||||
self.buffer.push((class, ch));
|
||||
}
|
||||
}
|
||||
|
||||
fn sort_pending(&mut self) {
|
||||
self.buffer[self.ready.end..].sort_by_key(|k| k.0);
|
||||
}
|
||||
|
||||
fn reset(&mut self) {
|
||||
let pending = self.buffer.len() - self.ready.end;
|
||||
|
||||
for i in 0..pending {
|
||||
self.buffer[i] = self.buffer[i + self.ready.end];
|
||||
}
|
||||
|
||||
self.buffer.truncate(pending);
|
||||
self.ready = 0..0;
|
||||
}
|
||||
|
||||
fn increment_next_ready(&mut self) {
|
||||
let next = self.ready.start + 1;
|
||||
|
||||
if next == self.ready.end {
|
||||
self.reset();
|
||||
} else {
|
||||
self.ready.start = next;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Decompositions<I> {
|
||||
iter: Fuse<I>,
|
||||
buffer: Buffer,
|
||||
}
|
||||
|
||||
impl<I: Iterator<Item = char>> Iterator for Decompositions<I> {
|
||||
type Item = char;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
while self.buffer.ready.end == 0 {
|
||||
match self.iter.next() {
|
||||
Some(ch) => {
|
||||
decompose(ch, &mut self.buffer);
|
||||
}
|
||||
None => {
|
||||
if self.buffer.buffer.is_empty() {
|
||||
return None;
|
||||
} else {
|
||||
self.buffer.sort_pending();
|
||||
self.buffer.ready.end = self.buffer.buffer.len();
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let (_, ch) = self.buffer.buffer[self.buffer.ready.start];
|
||||
|
||||
self.buffer.increment_next_ready();
|
||||
|
||||
Some(ch)
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> (usize, Option<usize>) {
|
||||
let (lower, _) = self.iter.size_hint();
|
||||
|
||||
(lower, None)
|
||||
}
|
||||
}
|
||||
|
||||
// Constants from Unicode 9.0.0 Section 3.12 Conjoining Jamo Behavior
// http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf#M9.32468.Heading.310.Combining.Jamo.Behavior

// Base code points of the syllable block and of the L/V/T jamo ranges.
const S_BASE: u32 = 0xAC00;
const L_BASE: u32 = 0x1100;
const V_BASE: u32 = 0x1161;
const T_BASE: u32 = 0x11A7;

// Sizes of the L/V/T index spaces.
const L_COUNT: u32 = 19;
const V_COUNT: u32 = 21;
const T_COUNT: u32 = 28;

// Derived counts: V*T combinations, and L*V*T combinations.
const N_COUNT: u32 = V_COUNT * T_COUNT;
const S_COUNT: u32 = L_COUNT * N_COUNT;
|
||||
|
||||
fn decompose(c: char, buffer: &mut Buffer) {
|
||||
// 7-bit ASCII never decomposes
|
||||
if c <= '\x7f' {
|
||||
buffer.push_back(c);
|
||||
return;
|
||||
}
|
||||
|
||||
// Perform decomposition for Hangul
|
||||
if is_hangul_syllable(c) {
|
||||
decompose_hangul(c, buffer);
|
||||
return;
|
||||
}
|
||||
|
||||
if let Some(decomposed) = table::lookup(c).decomposition() {
|
||||
match decomposed {
|
||||
Decomposition::Single(f) => {
|
||||
decompose(f, buffer);
|
||||
}
|
||||
Decomposition::Double(f, s) => {
|
||||
decompose(f, buffer);
|
||||
decompose(s, buffer);
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// Finally bottom out.
|
||||
buffer.push_back(c);
|
||||
}
|
||||
|
||||
fn is_hangul_syllable(c: char) -> bool {
|
||||
(c as u32) >= S_BASE && (c as u32) < (S_BASE + S_COUNT)
|
||||
}
|
||||
|
||||
#[allow(unsafe_code)]
|
||||
fn decompose_hangul(s: char, buffer: &mut Buffer) {
|
||||
let s_index = s as u32 - S_BASE;
|
||||
let l_index = s_index / N_COUNT;
|
||||
|
||||
unsafe {
|
||||
buffer.push_back(char::from_u32_unchecked(L_BASE + l_index));
|
||||
|
||||
let v_index = (s_index % N_COUNT) / T_COUNT;
|
||||
buffer.push_back(char::from_u32_unchecked(V_BASE + v_index));
|
||||
|
||||
let t_index = s_index % T_COUNT;
|
||||
if t_index > 0 {
|
||||
buffer.push_back(char::from_u32_unchecked(T_BASE + t_index));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use std::fs::File;
    use std::io::{BufRead, BufReader};

    use super::*;

    /// One data record of `NormalizationTest.txt`: the five columns decoded
    /// into strings, plus the trailing comment.
    #[derive(Default)]
    struct Entry {
        source: String,
        nfc: String,
        nfd: String,
        nfkc: String,
        nfkd: String,
        comment: String,
    }

    /// Decodes a space-separated list of hexadecimal code points into a string.
    fn parse_scalars(field: &str) -> String {
        field
            .split(' ')
            .map(|v| u32::from_str_radix(v, 16).expect("valid u32 value as hex"))
            .map(|v| char::from_u32(v).expect("valid char"))
            .collect::<String>()
    }

    /// Splits a data line into its columns; columns missing from the line
    /// keep their default (empty) value.
    fn parse_entry(line: &str) -> Entry {
        let mut entry = Entry::default();

        for (column, field) in line.splitn(6, ';').enumerate() {
            match column {
                0 => entry.source = parse_scalars(field),
                1 => entry.nfc = parse_scalars(field),
                2 => entry.nfd = parse_scalars(field),
                3 => entry.nfkc = parse_scalars(field),
                4 => entry.nfkd = parse_scalars(field),
                5 => entry.comment = field.trim_start_matches(['#', ' ']).trim_end().to_string(),
                // `splitn(6, ..)` never yields more than six items.
                _ => unreachable!(),
            }
        }

        entry
    }

    #[test]
    fn test_unicode_normalization() {
        let data = File::open("data/NormalizationTest.txt")
            .map(BufReader::new)
            .expect("unicode normalization test file");

        for (i, line) in data.lines().enumerate() {
            let line = line.expect("line");

            // Skip blank lines, `#` comments and `@Part` headers.
            if line.is_empty() || line.starts_with(['#', '@']) {
                continue;
            }

            let entry = parse_entry(&line);

            // c3 == toNFD(c1) == toNFD(c2) == toNFD(c3)
            similar_asserts::assert_str_eq!(
                nfd(&entry.source).collect::<String>(),
                entry.nfd,
                "c3 == toNFD(c1) at line {} # {}",
                i + 1,
                entry.comment
            );
            similar_asserts::assert_str_eq!(
                nfd(&entry.nfc).collect::<String>(),
                entry.nfd,
                "c3 == toNFD(c2) at line {} # {}",
                i + 1,
                entry.comment
            );
            similar_asserts::assert_str_eq!(
                nfd(&entry.nfd).collect::<String>(),
                entry.nfd,
                "c3 == toNFD(c3) at line {} # {}",
                i + 1,
                entry.comment
            );

            // c5 == toNFD(c4) == toNFD(c5)
            similar_asserts::assert_str_eq!(
                nfd(&entry.nfkc).collect::<String>(),
                entry.nfkd,
                "c5 == toNFD(c4) at line {} # {}",
                i + 1,
                entry.comment
            );
            similar_asserts::assert_str_eq!(
                nfd(&entry.nfkd).collect::<String>(),
                entry.nfkd,
                "c5 == toNFD(c5) at line {} # {}",
                i + 1,
                entry.comment
            );
        }
    }
}
|
||||
@@ -0,0 +1,114 @@
|
||||
use ufst::raw::Fst;
|
||||
|
||||
const TABLE: Fst<&'static [u8]> =
|
||||
Fst::new_unchecked(include_bytes!(concat!(env!("OUT_DIR"), "/table.fst")));
|
||||
|
||||
pub fn lookup(ch: char) -> Entry {
|
||||
Entry::new(
|
||||
TABLE
|
||||
.get((ch as u32).to_ne_bytes())
|
||||
.map(|output| output.value())
|
||||
.unwrap_or(0),
|
||||
)
|
||||
}
|
||||
|
||||
/// Decomposition mapping of a character: one or two replacement characters.
#[derive(Clone, Copy, PartialEq, Debug)]
pub enum Decomposition {
    Single(char),
    Double(char, char),
}

/// Packed per-character record.
///
/// Bit layout of the wrapped value:
///
/// * bits `0..8`   – combining class
/// * bits `8..29`  – first decomposition code point (0 when absent)
/// * bits `29..50` – second decomposition code point (0 when absent)
#[derive(Clone, Copy, PartialEq, Debug)]
pub struct Entry(u64);

impl Entry {
    /// Mask selecting the combining class in the low byte.
    const CLASS_MASK: u64 = 0xFF;
    /// Mask selecting one 21-bit code point after shifting.
    const MAPPING_MASK: u64 = 0x1F_FFFF;
    /// Bit offset of the first decomposition code point.
    const FIRST_MAPPING_SHIFT: u32 = 8;
    /// Bit offset of the second decomposition code point.
    const SECOND_MAPPING_SHIFT: u32 = 29;

    /// Wraps a raw packed value without validating it.
    pub(crate) fn new(data: u64) -> Self {
        Self(data)
    }

    /// Returns the combining class stored in the low eight bits.
    pub fn combining_class(&self) -> u8 {
        (self.0 & Self::CLASS_MASK) as u8
    }

    /// Returns the decomposition stored in this entry, if any.
    ///
    /// A zero first code point means "no decomposition", regardless of the
    /// second slot.
    ///
    /// # Panics
    ///
    /// Panics when a non-zero slot does not hold a valid Unicode scalar value
    /// (a surrogate or a value above U+10FFFF). Such a value can only come
    /// from a corrupt table, and converting it unchecked would be undefined
    /// behaviour.
    pub(crate) fn decomposition(&self) -> Option<Decomposition> {
        let first = self.mapping(Self::FIRST_MAPPING_SHIFT)?;

        Some(match self.mapping(Self::SECOND_MAPPING_SHIFT) {
            Some(second) => Decomposition::Double(first, second),
            None => Decomposition::Single(first),
        })
    }

    /// Extracts the 21-bit code point stored at `shift`; zero means absent.
    fn mapping(&self, shift: u32) -> Option<char> {
        let raw = ((self.0 >> shift) & Self::MAPPING_MASK) as u32;

        if raw == 0 {
            return None;
        }

        Some(char::from_u32(raw).expect("table entry holds a valid Unicode scalar value"))
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use proptest::prelude::*;

    use super::*;

    /// Generates a packed entry together with the parts it was built from:
    /// `(packed, (combining class, mapping count, first, second))`.
    fn entry_strategy() -> impl Strategy<Value = (u64, (u8, u8, char, char))> {
        (
            any::<u8>(),
            // Inclusive upper bound so that two-character decompositions are
            // generated as well; `0u8..2` would only ever yield 0 and 1.
            0u8..=2,
            // U+0000 is excluded because a zero slot encodes "no mapping".
            any::<char>().prop_filter("U+0000 encodes an absent mapping", |c| *c != '\u{0}'),
            any::<char>().prop_filter("U+0000 encodes an absent mapping", |c| *c != '\u{0}'),
        )
            .prop_map(
                |(combining_class, mapping_count, decomposition_first, decomposition_second)| {
                    let mut entry = combining_class as u64;

                    if mapping_count > 0 {
                        entry |= (decomposition_first as u64) << 8;
                    }

                    if mapping_count > 1 {
                        entry |= (decomposition_second as u64) << (21 + 8);
                    }

                    (
                        entry,
                        (
                            combining_class,
                            mapping_count,
                            decomposition_first,
                            decomposition_second,
                        ),
                    )
                },
            )
    }

    proptest! {
        #[test]
        fn proptest_entry_serialize_and_deserialize(a in entry_strategy()) {
            let (data, (combining_class, mapping_count, decomposition_first, decomposition_second)) = a;

            let b = Entry::new(data);

            prop_assert_eq!(b.combining_class(), combining_class, "data = {:064b}", data);

            match mapping_count {
                0 => prop_assert_eq!(b.decomposition(), None, "data = {:064b}", data),
                1 => prop_assert_eq!(
                    b.decomposition(),
                    Some(Decomposition::Single(decomposition_first)),
                    "data = {:064b}",
                    data
                ),
                2 => prop_assert_eq!(
                    b.decomposition(),
                    Some(Decomposition::Double(decomposition_first, decomposition_second)),
                    "data = {:064b}",
                    data
                ),
                _ => unreachable!(),
            }
        }
    }
}
|
||||
Reference in New Issue
Block a user