Initial commit

.gitignore (vendored, Normal file, 2 lines)
@@ -0,0 +1,2 @@
/target
/Cargo.lock

Cargo.toml (Normal file, 2 lines)
@@ -0,0 +1,2 @@
[workspace]
members = ["crates/*"]

README.md (Normal file, 2 lines)
@@ -0,0 +1,2 @@
https://icu4c-demos.unicode.org/icu-bin/collation.html
https://github.com/tertsdiepraam/collate

crates/parse/Cargo.toml (Normal file, 7 lines)
@@ -0,0 +1,7 @@
[package]
name = "parse"
version = "0.1.0"
edition = "2021"

[dev-dependencies]
similar-asserts = "1.2.0"

crates/parse/src/lib.rs (Normal file, 1 line)
@@ -0,0 +1 @@
pub mod uca;

crates/parse/src/uca.rs (Normal file, 1 line)
@@ -0,0 +1 @@
pub mod allkeys;

crates/parse/src/uca/allkeys.rs (Normal file, 192 lines)
@@ -0,0 +1,192 @@
//! Parse allkeys.txt
//!
//! See http://unicode.org/reports/tr10/#File_Format for information about the file format.

#[derive(Debug, Default)]
pub struct AllKeys {
    pub version: Option<Version>,
    pub implicit_weights: Vec<ImplicitWeight>,
    pub entries: Vec<Entry>,
}

#[derive(Debug)]
pub struct Version {
    pub major: u16,
    pub minor: u16,
    pub variant: u16,
}

#[derive(Debug)]
pub struct ImplicitWeight {
    pub start: u32,
    pub end: u32,
    pub base: u32,
    pub comment: Option<String>,
}

#[derive(Debug)]
pub struct Entry {
    pub chars: Vec<u32>,
    pub elements: Vec<Element>,
    pub comment: Option<String>,
}

#[derive(Debug, PartialEq)]
pub struct Element {
    pub l1: u16,
    pub l2: u16,
    pub l3: u8,
    pub l4: u16,
    pub variable: bool,
}

pub fn parse(input: &str) -> AllKeys {
    let mut all_keys = AllKeys::default();

    for line in input.lines() {
        let line = line.trim();

        // If the line is empty, there is nothing to do with it
        if line.is_empty() {
            continue;
        }

        // If a line starts with '#', it is a comment, so skip it
        if line.starts_with('#') {
            continue;
        }

        if line.starts_with("@version") {
            let mut iter = line.trim_start_matches("@version").trim().splitn(3, '.');

            all_keys.version = Some(Version {
                major: iter.next().unwrap().parse().unwrap(),
                minor: iter.next().unwrap().parse().unwrap(),
                variant: iter.next().unwrap().parse().unwrap(),
            });
        } else if line.starts_with("@implicitweights") {
            let (range, base) = line
                .trim_start_matches("@implicitweights")
                .trim()
                .split_once(';')
                .unwrap();

            let (start, end) = range.split_once("..").unwrap();
            let (base, comment) = base.split_once('#').unwrap();

            let comment = comment.trim();

            all_keys.implicit_weights.push(ImplicitWeight {
                start: u32::from_str_radix(start.trim(), 16).unwrap(),
                end: u32::from_str_radix(end.trim(), 16).unwrap(),
                base: u32::from_str_radix(base.trim(), 16).unwrap(),
                comment: if !comment.is_empty() {
                    Some(comment.to_string())
                } else {
                    None
                },
            });
        } else {
            let (chars, rest) = line.split_once(';').unwrap();

            let chars = chars
                .trim()
                .split(' ')
                .map(|x| u32::from_str_radix(x, 16).unwrap())
                .collect::<Vec<_>>();

            let (elements, comment) = rest.split_once('#').unwrap();
            let comment = comment.trim();

            let elements = elements
                .split("][")
                .map(|coll_element| {
                    let coll_element = coll_element
                        .trim()
                        .trim_start_matches('[')
                        .trim_end_matches(']');

                    let variable = coll_element.starts_with('*');

                    let mut iter = coll_element
                        .trim_start_matches(['.', '*'])
                        .split(['.', '*']);

                    Element {
                        l1: iter
                            .next()
                            .and_then(|x| u16::from_str_radix(x, 16).ok())
                            .expect("valid l1 value"),
                        l2: iter
                            .next()
                            .and_then(|x| u16::from_str_radix(x, 16).ok())
                            .expect("valid l2 value"),
                        l3: iter
                            .next()
                            .and_then(|x| u8::from_str_radix(x, 16).ok())
                            .expect("valid l3 value"),
                        l4: iter
                            .next()
                            .map(|x| u16::from_str_radix(x, 16).expect("valid l4 value"))
                            .unwrap_or(0),
                        variable,
                    }
                })
                .collect::<Vec<_>>();

            all_keys.entries.push(Entry {
                chars,
                elements,
                comment: if !comment.is_empty() {
                    Some(comment.to_string())
                } else {
                    None
                },
            });
        }
    }

    all_keys
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_allkeys() {
        let data = std::fs::read_to_string("data/allkeys.txt").unwrap();

        let all_keys = parse(&data);

        similar_asserts::assert_eq!(
            all_keys
                .entries
                .iter()
                .find(|entry| entry.chars[..] == [0x1abc])
                .map(|entry| &entry.elements),
            Some(&vec![Element {
                l1: 0,
                l2: 51,
                l3: 2,
                l4: 0,
                variable: false
            }])
        );

        similar_asserts::assert_eq!(
            all_keys
                .entries
                .iter()
                .find(|entry| entry.chars[..] == [0x1ac1])
                .map(|entry| &entry.elements),
            Some(&vec![Element {
                l1: 0,
                l2: 51,
                l3: 2,
                l4: 0,
                variable: false
            }])
        );
    }
}
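
A minimal sketch of how this parser is driven (illustrative only; the lines and weight values below are invented, not copied from a real allkeys.txt):

```rust
// Hypothetical two-line excerpt in the allkeys.txt format; the weights
// are made up.
let input = "\
@version 14.0.0
0041 ; [.206D.0020.0008] # LATIN CAPITAL LETTER A
";

let all_keys = parse::uca::allkeys::parse(input);
assert_eq!(all_keys.version.as_ref().map(|v| v.major), Some(14));
assert_eq!(all_keys.entries[0].chars, vec![0x0041]);
assert_eq!(all_keys.entries[0].elements[0].l1, 0x206D);
```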

crates/smol-uca/Cargo.toml (Normal file, 17 lines)
@@ -0,0 +1,17 @@
[package]
name = "smol-uca"
version = "0.1.0"
edition = "2021"

[dependencies]
ufst = { path = "../ufst" }
unf = { path = "../unf" }

[build-dependencies]
bytemuck = "1.9.1"
parse = { path = "../parse" }
ufst = { path = "../ufst" }

[dev-dependencies]
proptest = "1.0.0"
similar-asserts = "1.2.0"

crates/smol-uca/README.md (Normal file, 16 lines)
@@ -0,0 +1,16 @@
# Smol UCA

Implementation of the [Unicode Collation Algorithm](https://unicode.org/reports/tr10/).

## Todo

- [x] Build fst on build
- [x] Switch to ufst
- [ ] Add benchmarks

## See

- [ziglyph](https://github.com/jecolon/ziglyph/blob/main/src/collator/Collator.zig)
- [pyuca](https://github.com/jtauber/pyuca)
- [collate](https://github.com/tertsdiepraam/collate)
- [UCA Auxiliary Files](http://www.unicode.org/Public/UCA/6.0.0/CollationAuxiliary.html)

crates/smol-uca/build.rs (Normal file, 108 lines)
@@ -0,0 +1,108 @@
use std::env;
use std::fs;
use std::path::Path;

use parse::uca::allkeys;
use ufst::raw::Builder;

fn main() {
    println!("cargo:rerun-if-changed=data/allkeys.txt");
    println!("cargo:rerun-if-changed=build.rs");

    let all_keys = {
        let data = std::fs::read_to_string("data/allkeys.txt").unwrap();

        allkeys::parse(&data)
    };

    let out_dir = env::var_os("OUT_DIR").unwrap();

    let mut implicit_weights = String::new();

    implicit_weights.push_str(&format!(
        "pub const IMPLICIT_WEIGHTS: [(u32, u32, u32); {}] = [\n",
        all_keys.implicit_weights.len(),
    ));

    for implicit_weight in all_keys.implicit_weights {
        implicit_weights.push_str(&format!(
            "    ({}, {}, {}),\n",
            implicit_weight.start, implicit_weight.end, implicit_weight.base
        ));
    }

    implicit_weights.push_str("];\n");

    fs::write(
        &Path::new(&out_dir).join("implicit_weights.rs"),
        implicit_weights,
    )
    .unwrap();

    let mut entries = all_keys
        .entries
        .into_iter()
        .map(|entry| {
            (
                bytemuck::cast_slice::<u32, u8>(&entry.chars).to_vec(),
                entry.elements,
            )
        })
        .collect::<Vec<_>>();

    entries.sort_unstable_by(|(a, _), (b, _)| a.cmp(b));

    let mut builder = Builder::memory();
    let mut overflow = Vec::new();

    for (chars, mut elements) in entries.into_iter() {
        let value = match elements.len() {
            1 => {
                let element = elements.pop().unwrap();

                ((element.l4 as u64) << 42)
                    | ((element.l3 as u64) << 34)
                    | ((element.l2 as u64) << 18)
                    | ((element.l1 as u64) << 2)
                    | (if element.variable { 1 } else { 0 } << 1)
            }
            2.. => {
                let idx = overflow.len();
                let len = elements.len();

                overflow.extend(elements.into_iter());

                ((idx as u64) << 9) | ((len as u64) << 1) | 1
            }
            _ => panic!("this shouldn't happen!"),
        };

        builder.insert(chars, value).unwrap();
    }

    let data = builder.into_fst().into_inner();

    fs::write(&Path::new(&out_dir).join("table.fst"), data).unwrap();

    let mut explicit_weights = String::new();

    explicit_weights.push_str(&format!(
        "pub const EXPLICIT_WEIGHTS: [(u16, u16, u8, u16, bool); {}] = [\n",
        overflow.len(),
    ));

    for element in overflow {
        explicit_weights.push_str(&format!(
            "    ({}, {}, {}, {}, {}),\n",
            element.l1, element.l2, element.l3, element.l4, element.variable,
        ));
    }

    explicit_weights.push_str("];\n");

    fs::write(
        &Path::new(&out_dir).join("explicit_weights.rs"),
        explicit_weights,
    )
    .unwrap();
}
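
The packed u64 layout built here is decoded again by `Value::from_u64` in src/table.rs below. Spelled out as a sketch (the helper name is invented, not part of the commit):

```rust
// Tag bit 0 selects the representation:
//   0 = a single collation element packed inline:
//       bits 57..=42 l4, 41..=34 l3, 33..=18 l2, 17..=2 l1, bit 1 variable
//   1 = an index into EXPLICIT_WEIGHTS:
//       bits 40..=9 idx, bits 8..=1 len
fn pack_inline(l1: u16, l2: u16, l3: u8, l4: u16, variable: bool) -> u64 {
    ((l4 as u64) << 42)
        | ((l3 as u64) << 34)
        | ((l2 as u64) << 18)
        | ((l1 as u64) << 2)
        | ((variable as u64) << 1)
}
```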

crates/smol-uca/data/CollationTest_NON_IGNORABLE.txt (Normal file, 211458 lines)
File diff suppressed because it is too large.

crates/smol-uca/data/CollationTest_SHIFTED.txt (Normal file, 227503 lines)
File diff suppressed because it is too large.

crates/smol-uca/data/allkeys.txt (Normal file, 33925 lines)
File diff suppressed because it is too large.

crates/smol-uca/src/collator.rs (Normal file, 373 lines)
@@ -0,0 +1,373 @@
use std::iter;

use crate::table::{self, Element};
use crate::weights::IMPLICIT_WEIGHTS;

#[derive(Default)]
pub struct Collator;

impl Collator {
    fn collation_elements(&self, normalized: &[char]) -> Vec<Element> {
        let debug = false;

        let mut all_elements = Vec::new();

        let mut code_points = normalized.to_vec();
        let mut code_points_len = code_points.len();
        let mut cp_index = 0;

        if debug {
            eprintln!(
                "nfd: {}",
                normalized
                    .iter()
                    .map(|c| format!("{:04X}", *c as u32))
                    .collect::<Vec<_>>()
                    .join(" ")
            );
        }

        while cp_index < code_points_len {
            let (mut elements, idx) = table::lookup(&code_points[cp_index..]).unwrap_or_default();

            if debug {
                eprintln!(
                    "found at 1: [{}], idx: {}, start: {:04X}",
                    elements
                        .iter()
                        .map(ToString::to_string)
                        .collect::<Vec<_>>()
                        .join("]["),
                    idx,
                    code_points[cp_index] as u32
                );
            }

            let s = &code_points[0..cp_index + idx + 1];

            // handle non-starters
            let mut last_class = None;
            let tail_start = cp_index + idx + 1;
            let mut tail_index = tail_start;

            // advance to last combining C
            while tail_index < code_points_len {
                let combining_class = unf::table::lookup(code_points[tail_index]).combining_class();

                if debug {
                    eprintln!(
                        "combining class: {}, start: {:04X}",
                        combining_class, code_points[tail_index] as u32
                    );
                }

                if combining_class == 0 {
                    if tail_index != tail_start {
                        tail_index -= 1;
                    }

                    break;
                }

                if let Some(last_class) = last_class {
                    if last_class >= combining_class {
                        if tail_index != tail_start {
                            tail_index -= 1;
                        }

                        break;
                    }
                }

                last_class = Some(combining_class);
                tail_index += 1;
            }

            if tail_index == code_points_len {
                tail_index -= 1;
            }

            if tail_index > tail_start {
                let c = code_points[tail_index];

                let mut new_key = Vec::with_capacity(s.len() + 1);
                new_key.extend_from_slice(s);
                new_key.push(c);

                if debug {
                    eprintln!(
                        "new key: {}, s: {}, c: {:04X}",
                        new_key
                            .iter()
                            .map(|c| format!("{:04X}", *c as u32))
                            .collect::<Vec<_>>()
                            .join(" "),
                        s.iter()
                            .map(|c| format!("{:04X}", *c as u32))
                            .collect::<Vec<_>>()
                            .join(" "),
                        c as u32
                    );
                }

                let (new_elements, new_idx) = table::lookup(&new_key).expect("lookup");

                if debug {
                    eprintln!(
                        "found at 2: [{}], idx: {}, start: {:04X}",
                        new_elements
                            .iter()
                            .map(ToString::to_string)
                            .collect::<Vec<_>>()
                            .join("]["),
                        new_idx,
                        new_key[0] as u32
                    );
                }

                if new_idx == (new_key.len() - 1) && !new_elements.is_empty() {
                    cp_index = tail_start;

                    // splice
                    let mut tmp = Vec::with_capacity(code_points_len - 1);
                    tmp.extend_from_slice(&code_points[0..tail_index]);

                    if tail_index + 1 < code_points_len {
                        tmp.extend_from_slice(&code_points[tail_index + 1..]);
                    }

                    code_points = tmp;
                    code_points_len = code_points.len();

                    if debug {
                        eprintln!("add part 2 elements to all");
                    }

                    // add elements to final collection
                    all_elements.extend(new_elements);

                    continue;
                }
            }

            if elements.is_empty() {
                if debug {
                    eprintln!("no part 1 elements, use implicit weight");
                }

                elements = self.implicit_weight(code_points[0] as u32);

                if debug {
                    eprintln!(
                        "found at 3: [{}], start: {:04X}",
                        elements
                            .iter()
                            .map(ToString::to_string)
                            .collect::<Vec<_>>()
                            .join("]["),
                        code_points[0] as u32
                    );
                }
            }

            if debug {
                eprintln!("add part 1 elements to all");
            }

            // add elements to final collection
            all_elements.extend(elements);

            cp_index += idx + 1;
        }

        if debug {
            eprintln!(
                "all: [{}]",
                all_elements
                    .iter()
                    .map(ToString::to_string)
                    .collect::<Vec<_>>()
                    .join("][")
            );
        }

        all_elements
    }

    fn implicit_weight(&self, cp: u32) -> Vec<Element> {
        let base;
        let mut aaaa = None;
        let mut bbbb = 0;

        if is_unified_ideograph(cp)
            && ((cp >= 0x4E00 && cp <= 0x9FFF) || (cp >= 0xF900 && cp <= 0xFAFF))
        {
            base = 0xFB40;
            aaaa = Some(base + (cp >> 15));
            bbbb = (cp & 0x7FFF) | 0x8000;
        } else if is_unified_ideograph(cp)
            && !((cp >= 0x4E00 && cp <= 0x9FFF) || (cp >= 0xF900 && cp <= 0xFAFF))
        {
            base = 0xFB80;
            aaaa = Some(base + (cp >> 15));
            bbbb = (cp & 0x7FFF) | 0x8000;
        } else {
            if let Some((start, _, base)) = IMPLICIT_WEIGHTS
                .iter()
                .find(|(start, end, _)| cp >= *start && cp <= *end)
            {
                aaaa = Some(*base);

                // Tangut components (U+18D00..U+18D8F) are offset from the
                // start of the Tangut block rather than from their own range.
                if cp >= 0x18D00 && cp <= 0x18D8F {
                    bbbb = (cp - 0x17000) | 0x8000;
                } else {
                    bbbb = (cp - *start) | 0x8000;
                }
            }

            if aaaa.is_none() {
                base = 0xFBC0;
                aaaa = Some(base + (cp >> 15));
                bbbb = (cp & 0x7FFF) | 0x8000;
            }
        }

        vec![
            Element {
                l1: aaaa.map(|x| (x & 0xFFFF) as u16).unwrap_or(0),
                l2: 0x0020,
                l3: 0x0002,
                l4: 0x0000,
                variable: false,
            },
            Element {
                l1: (bbbb & 0xFFFF) as u16,
                l2: 0x0000,
                l3: 0x0000,
                l4: 0x0000,
                variable: false,
            },
        ]
    }

    fn sort_key_from_collation_elements(&self, collation_elements: &[Element]) -> Vec<u16> {
        let l1 = collation_elements
            .iter()
            .map(|element| element.l1)
            .filter(|x| *x > 0);

        let l2 = collation_elements
            .iter()
            .map(|element| element.l2)
            .filter(|x| *x > 0);

        let l3 = collation_elements
            .iter()
            .map(|element| element.l3 as u16)
            .filter(|x| *x > 0);

        let l4 = collation_elements
            .iter()
            .map(|element| element.l4)
            .filter(|x| *x > 0);

        l1.chain(iter::once(0))
            .chain(l2)
            .chain(iter::once(0))
            .chain(l3)
            .chain(iter::once(0))
            .chain(l4)
            .collect()
    }

    pub fn sort_key<S: AsRef<str>>(&self, input: S) -> Vec<u16> {
        let normalized = unf::nfd(input.as_ref()).collect::<Vec<_>>();
        let collation_elements = self.collation_elements(&normalized);

        self.sort_key_from_collation_elements(&collation_elements)
    }
}

pub fn is_unified_ideograph(cp: u32) -> bool {
    if cp < 0x3400 || cp > 0x3134a {
        return false;
    }

    match cp {
        0x3400..=0x4dbf => true,
        0x4e00..=0x9fff => true,
        0xfa0e..=0xfa0f => true,
        0xfa11 => true,
        0xfa13..=0xfa14 => true,
        0xfa1f => true,
        0xfa21 => true,
        0xfa23..=0xfa24 => true,
        0xfa27..=0xfa29 => true,
        0x20000..=0x2a6df => true,
        0x2a700..=0x2b738 => true,
        0x2b740..=0x2b81d => true,
        0x2b820..=0x2cea1 => true,
        0x2ceb0..=0x2ebe0 => true,
        0x30000..=0x3134a => true,
        _ => false,
    }
}

#[cfg(test)]
mod tests {
    use crate::fmt;

    use super::*;

    #[test]
    fn test_bug_001() {
        let collator = Collator::default();

        let fixture = "\u{1abc}\u{334}";
        let sort_key = collator.sort_key(fixture);

        similar_asserts::assert_eq!("| 004A 0033 | 0002 0002 |", fmt(&sort_key));

        let fixture = "\u{1ac1}\u{334}";
        let sort_key = collator.sort_key(fixture);

        similar_asserts::assert_eq!("| 004A 0033 | 0002 0002 |", fmt(&sort_key));
    }

    #[test]
    fn test_bug_002() {
        let collator = Collator::default();

        let fixture = "\u{a8}\u{301}\u{334}";
        let sort_key = collator.sort_key(fixture);

        similar_asserts::assert_eq!(
            "04D0 | 0020 004A 0024 | 0002 0002 0002 |",
            fmt(&sort_key),
            "nfd: {:?}",
            unf::nfd(fixture)
                .map(|ch| format!("{:04X}", ch as u32))
                .collect::<Vec<_>>()
                .join(" ")
        );
    }

    #[test]
    fn test_bug_003() {
        let collator = Collator::default();

        let fixture = "\u{18d00}\u{21}";
        let sort_key = collator.sort_key(fixture);

        similar_asserts::assert_eq!(
            "FB00 9D00 0268 | 0020 0020 | 0002 0002 |",
            fmt(&sort_key),
            "nfd: {:?}",
            unf::nfd(fixture)
                .map(|ch| format!("{:04X}", ch as u32))
                .collect::<Vec<_>>()
                .join(" ")
        );
    }
}
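
A minimal usage sketch of the collator's public API (the example strings are illustrative; their order follows the default UCA table, where lowercase sorts before uppercase at the tertiary level):

```rust
use smol_uca::collator::Collator;

let collator = Collator::default();

// Sort keys are plain Vec<u16> values, so collation order is just key order.
let mut words = vec!["banana", "Apple", "apple"];
words.sort_by_key(|w| collator.sort_key(w));
assert_eq!(words, ["apple", "Apple", "banana"]);
```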

crates/smol-uca/src/lib.rs (Normal file, 16 lines)
@@ -0,0 +1,16 @@
pub mod collator;
mod table;
mod weights;

pub fn fmt(sort_key: &[u16]) -> String {
    use std::borrow::Cow;

    sort_key
        .iter()
        .map(|x| match x {
            0 => Cow::Borrowed("|"),
            _ => Cow::Owned(format!("{:04X}", x)),
        })
        .collect::<Vec<_>>()
        .join(" ")
}
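
`fmt` renders a sort key with `|` marking the zero separators between weight levels; a quick sketch:

```rust
// One L1 weight, one L2 weight, one L3 weight, then an empty L4 level.
assert_eq!(
    smol_uca::fmt(&[0x04D0, 0, 0x0020, 0, 0x0002, 0]),
    "04D0 | 0020 | 0002 |"
);
```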

crates/smol-uca/src/table.rs (Normal file, 161 lines)
@@ -0,0 +1,161 @@
use std::fmt::Display;

use ufst::raw::{Fst, Output};

const TABLE: Fst<&'static [u8]> =
    Fst::new_unchecked(include_bytes!(concat!(env!("OUT_DIR"), "/table.fst")));

/// Looks up the longest prefix of `value` that has an entry in the table,
/// returning its collation elements and the index of the last code point
/// consumed.
pub fn lookup(value: &[char]) -> Option<(Vec<Element>, usize)> {
    let mut node = TABLE.root();
    let mut out = Output::zero();

    let mut last_match = None;

    // Keys in the FST are the native-endian bytes of each code point, which
    // matches how build.rs encodes them with bytemuck::cast_slice. Walk the
    // FST and remember the most recent final state so the longest match wins.
    'char: for (i, &c) in value.iter().enumerate() {
        for b in (c as u32).to_ne_bytes() {
            if let Some(trans_index) = node.find_input(b) {
                let t = node.transition(trans_index);

                node = TABLE.node(t.addr);
                out = out.cat(t.out);

                if node.is_final() {
                    last_match = Some((out.cat(node.final_output()).value(), i));
                }
            } else {
                break 'char;
            }
        }
    }

    last_match.map(|(data, idx)| {
        (
            match Value::from_u64(data) {
                Value::Entry(element) => vec![element],
                Value::Index(idx, len) => {
                    let start = idx as usize;
                    let end = start + len as usize;

                    crate::weights::EXPLICIT_WEIGHTS[start..end]
                        .iter()
                        .map(|(l1, l2, l3, l4, variable)| Element {
                            l1: *l1,
                            l2: *l2,
                            l3: *l3,
                            l4: *l4,
                            variable: *variable,
                        })
                        .collect()
                }
            },
            idx,
        )
    })
}

#[derive(Clone, Copy, PartialEq, Debug)]
pub struct Element {
    pub l1: u16,
    pub l2: u16,
    pub l3: u8,
    pub l4: u16,
    pub variable: bool,
}

impl Display for Element {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
            "{}{:04X}.{:04X}.{:04X}.{:04X}",
            if self.variable { "*" } else { "." },
            self.l1,
            self.l2,
            self.l3,
            self.l4
        )?;

        Ok(())
    }
}

#[derive(Clone, Copy, PartialEq, Debug)]
enum Value {
    Entry(Element),
    Index(u32, u8),
}

impl Value {
    fn to_u64(self) -> u64 {
        match self {
            Self::Entry(element) => {
                ((element.l4 as u64) << 42)
                    | ((element.l3 as u64) << 34)
                    | ((element.l2 as u64) << 18)
                    | ((element.l1 as u64) << 2)
                    | (if element.variable { 1 } else { 0 } << 1)
            }
            Self::Index(idx, len) => ((idx as u64) << 9) | ((len as u64) << 1) | 1,
        }
    }

    fn from_u64(data: u64) -> Self {
        if (data & 1) == 0 {
            let variable = ((data >> 1) & 1) == 1;

            let l1 = ((data >> 2) & 0xFFFF) as u16;
            let l2 = ((data >> 18) & 0xFFFF) as u16;
            let l3 = ((data >> 34) & 0xFF) as u8;
            let l4 = ((data >> 42) & 0xFFFF) as u16;

            Self::Entry(Element {
                l1,
                l2,
                l3,
                l4,
                variable,
            })
        } else {
            let len = ((data >> 1) & 0xFF) as u8;
            let idx = ((data >> 9) & 0xFFFFFFFF) as u32;

            Self::Index(idx, len)
        }
    }
}

#[cfg(test)]
mod tests {
    use proptest::prelude::*;

    use super::*;

    fn value_strategy() -> impl Strategy<Value = Value> {
        prop_oneof![
            (any::<u32>(), any::<u8>()).prop_map(|(idx, len)| Value::Index(idx, len)),
            (
                any::<u16>(),
                any::<u16>(),
                any::<u8>(),
                any::<u16>(),
                any::<bool>()
            )
                .prop_map(|(l1, l2, l3, l4, variable)| Value::Entry(Element {
                    l1,
                    l2,
                    l3,
                    l4,
                    variable
                })),
        ]
    }

    proptest! {
        #[test]
        fn proptest_serialize_and_deserialize(a in value_strategy()) {
            let data = a.to_u64();
            let b = Value::from_u64(data);

            prop_assert_eq!(a, b);
        }
    }
}

crates/smol-uca/src/weights.rs (Normal file, 2 lines)
@@ -0,0 +1,2 @@
include!(concat!(env!("OUT_DIR"), "/explicit_weights.rs"));
include!(concat!(env!("OUT_DIR"), "/implicit_weights.rs"));

crates/smol-uca/tests/collation_test.rs (Normal file, 91 lines)
@@ -0,0 +1,91 @@
use std::fs::File;
use std::io::{BufRead, BufReader};

use smol_uca::collator::Collator;

#[test]
fn collation_test_non_ignorable() {
    let data = File::open("data/CollationTest_NON_IGNORABLE.txt")
        .map(BufReader::new)
        .expect("collation test data");

    let collator = Collator::default();

    let mut prev_sort_key = None;

    let mut order_errors = 0;
    let mut sort_key_errors = 0;

    'line: for (n, line) in data.lines().enumerate() {
        let line = line.expect("line");

        let line = line.trim_start();

        if line.is_empty() || line.starts_with('#') {
            continue;
        }

        let (chars, rest) = line
            .split_once(';')
            .expect("a semicolon separated test line");

        let mut surrogates = false;
        let test_string = chars
            .trim()
            .split(' ')
            .map(|x| u32::from_str_radix(x, 16).expect("a valid hex value"))
            .map(|x| match char::from_u32(x) {
                Some(ch) => ch,
                None => {
                    if (0xD800u32..=0xDFFF).contains(&x) {
                        surrogates = true;

                        ' '
                    } else {
                        panic!("{}", line)
                    }
                }
            })
            .collect::<String>();

        if surrogates {
            continue 'line;
        }

        let expected_sort_key = rest.rsplit(['[', ']']).nth(1).expect("sort key");

        let sort_key = collator.sort_key(&test_string);
        let fmt_sort_key = smol_uca::fmt(&sort_key);

        if let Some(prev_sort_key) = prev_sort_key.take() {
            if sort_key < prev_sort_key {
                eprintln!(
                    "Error at line {}: {:?} [{}]",
                    n + 1,
                    test_string,
                    expected_sort_key
                );

                order_errors += 1;
            }
        }

        prev_sort_key = Some(sort_key);

        if fmt_sort_key != expected_sort_key {
            eprintln!(
                "Error at line {}: {:?} expected: [{}], got: [{}] ({})",
                n + 1,
                unf::nfd(&test_string).collect::<String>(),
                expected_sort_key,
                fmt_sort_key,
                line
            );

            sort_key_errors += 1;
        }
    }

    assert_eq!(order_errors, 0);
    assert_eq!(sort_key_errors, 0);
}
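
For reference, the driver above expects CollationTest lines shaped roughly like the hand-written illustration below (the weights are invented): the code points before the `;` are decoded, and the expected sort key is the text between the final `[` and `]`.

```text
0061 0062;  # ('ab') LATIN SMALL LETTER A, LATIN SMALL LETTER B  [1CAD 1CC6 | 0020 0020 | 0002 0002 |]
```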

crates/ufst/.gitignore (vendored, Normal file, 14 lines)
@@ -0,0 +1,14 @@
.*.swp
tags
target
*.lock
tmp
*.csv
*.fst
*-got
*.csv.idx
words
98m*
dict
test
months

crates/ufst/COPYING (Normal file, 3 lines)
@@ -0,0 +1,3 @@
This project is dual-licensed under the Unlicense and MIT licenses.

You may use this code under the terms of either license.

crates/ufst/Cargo.toml (Normal file, 17 lines)
@@ -0,0 +1,17 @@
[package]
name = "ufst"
version = "0.4.7" #:version
authors = ["Andrew Gallant <jamslam@gmail.com>", "Anders Olsson <anders.e.olsson@gmail.com>"]
description = """
Use finite state transducers to compactly represent sets or maps of many
strings (> 1 billion is possible).
"""
license = "Unlicense/MIT"
edition = "2021"

[dev-dependencies]
doc-comment = "0.3.1"
fnv = "1.0.6"
memmap = "0.7"
quickcheck = { version = "0.9.2", default-features = false }
rand = "0.7.3"

crates/ufst/LICENSE-MIT (Normal file, 21 lines)
@@ -0,0 +1,21 @@
The MIT License (MIT)

Copyright (c) 2015 Andrew Gallant

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

crates/ufst/README.md (Normal file, 76 lines)
@@ -0,0 +1,76 @@
fst
===
This crate provides a fast implementation of ordered sets and maps using finite
state machines. In particular, it makes use of finite state transducers to map
keys to values as the machine is executed. Using finite state machines as data
structures enables us to store keys in a compact format that is also easily
searchable. For example, this crate leverages memory maps to make range queries
very fast.

Check out my blog post
[Index 1,600,000,000 Keys with Automata and
Rust](https://blog.burntsushi.net/transducers/)
for extensive background, examples and experiments.

[Build status](https://github.com/BurntSushi/fst/actions)
[crates.io](https://crates.io/crates/fst)

Dual-licensed under MIT or the [UNLICENSE](https://unlicense.org/).


### Documentation

https://docs.rs/fst

The
[`regex-automata`](https://docs.rs/regex-automata)
crate provides implementations of the `fst::Automaton` trait when its
`transducer` feature is enabled. This permits using DFAs compiled by
`regex-automata` to search finite state transducers produced by this crate.


### Installation

Simply add a corresponding entry to your `Cargo.toml` dependency list:

```toml,ignore
[dependencies]
fst = "0.4"
```


### Example

This example demonstrates building a set in memory and executing a fuzzy query
against it. You'll need `fst = "0.4"` with the `levenshtein` feature enabled in
your `Cargo.toml`.

```rust
use fst::{IntoStreamer, Set};
use fst::automaton::Levenshtein;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // A convenient way to create sets in memory.
    let keys = vec!["fa", "fo", "fob", "focus", "foo", "food", "foul"];
    let set = Set::from_iter(keys)?;

    // Build our fuzzy query.
    let lev = Levenshtein::new("foo", 1)?;

    // Apply our fuzzy query to the set we built.
    let stream = set.search(lev).into_stream();

    let keys = stream.into_strs()?;
    assert_eq!(keys, vec!["fo", "fob", "foo", "food"]);
    Ok(())
}
```

Check out the documentation for a lot more examples!


### Cargo features

* `levenshtein` - **Disabled** by default. This adds the `Levenshtein`
  automaton to the `automaton` sub-module. This includes an additional
  dependency on `utf8-ranges`.

crates/ufst/UNLICENSE (Normal file, 24 lines)
@@ -0,0 +1,24 @@
This is free and unencumbered software released into the public domain.

Anyone is free to copy, modify, publish, use, compile, sell, or
distribute this software, either in source code form or as a compiled
binary, for any purpose, commercial or non-commercial, and by any
means.

In jurisdictions that recognize copyright laws, the author or authors
of this software dedicate any and all copyright interest in the
software to the public domain. We make this dedication for the benefit
of the public at large and to the detriment of our heirs and
successors. We intend this dedication to be an overt act of
relinquishment in perpetuity of all present and future rights to this
software under copyright law.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.

For more information, please refer to <http://unlicense.org/>

crates/ufst/build.rs (Normal file, 124 lines)
@@ -0,0 +1,124 @@
use std::env;
use std::fs::File;
use std::io::{self, Write};
use std::path::{Path, PathBuf};

const CASTAGNOLI_POLY: u32 = 0x82f63b78;

type Result<T> = std::result::Result<T, Box<dyn std::error::Error>>;

fn main() {
    if let Err(err) = try_main() {
        panic!("{}", err);
    }
}

fn try_main() -> Result<()> {
    let out_dir = match env::var_os("OUT_DIR") {
        None => {
            return Err(From::from("OUT_DIR environment variable not defined"))
        }
        Some(out_dir) => PathBuf::from(out_dir),
    };
    write_tag_lookup_table(&out_dir)?;
    write_crc_tables(&out_dir)?;
    Ok(())
}

fn write_tag_lookup_table(out_dir: &Path) -> Result<()> {
    let out_path = out_dir.join("tag.rs");
    let mut out = io::BufWriter::new(File::create(out_path)?);

    writeln!(out, "pub const TAG_LOOKUP_TABLE: [u16; 256] = [")?;
    for b in 0u8..=255 {
        writeln!(out, "    {},", tag_entry(b))?;
    }
    writeln!(out, "];")?;
    Ok(())
}

fn tag_entry(b: u8) -> u16 {
    let b = b as u16;
    match b & 0b00000011 {
        0b00 => {
            let lit_len = (b >> 2) + 1;
            if lit_len <= 60 {
                lit_len
            } else {
                assert!(lit_len <= 64);
                (lit_len - 60) << 11
            }
        }
        0b01 => {
            let len = 4 + ((b >> 2) & 0b111);
            let offset = (b >> 5) & 0b111;
            (1 << 11) | (offset << 8) | len
        }
        0b10 => {
            let len = 1 + (b >> 2);
            (2 << 11) | len
        }
        0b11 => {
            let len = 1 + (b >> 2);
            (4 << 11) | len
        }
        _ => unreachable!(),
    }
}

fn write_crc_tables(out_dir: &Path) -> Result<()> {
    let out_path = out_dir.join("crc32_table.rs");
    let mut out = io::BufWriter::new(File::create(out_path)?);

    let table = make_table(CASTAGNOLI_POLY);
    let table16 = make_table16(CASTAGNOLI_POLY);

    writeln!(out, "pub const TABLE: [u32; 256] = [")?;
    for &x in table.iter() {
        writeln!(out, "    {},", x)?;
    }
    writeln!(out, "];\n")?;

    writeln!(out, "pub const TABLE16: [[u32; 256]; 16] = [")?;
    for table in table16.iter() {
        writeln!(out, "    [")?;
        for &x in table.iter() {
            writeln!(out, "        {},", x)?;
        }
        writeln!(out, "    ],")?;
    }
    writeln!(out, "];")?;

    out.flush()?;

    Ok(())
}

fn make_table16(poly: u32) -> [[u32; 256]; 16] {
    let mut tab = [[0; 256]; 16];
    tab[0] = make_table(poly);
    for i in 0..256 {
        let mut crc = tab[0][i];
        for j in 1..16 {
            crc = (crc >> 8) ^ tab[0][crc as u8 as usize];
            tab[j][i] = crc;
        }
    }
    tab
}

fn make_table(poly: u32) -> [u32; 256] {
    let mut tab = [0; 256];
    for i in 0u32..256u32 {
        let mut crc = i;
        for _ in 0..8 {
            if crc & 1 == 1 {
                crc = (crc >> 1) ^ poly;
            } else {
                crc >>= 1;
            }
        }
        tab[i as usize] = crc;
    }
    tab
}

crates/ufst/data/wiki-urls-10000 (Normal file, 10000 lines)
File diff suppressed because it is too large.

crates/ufst/data/wiki-urls-100000 (Normal file, 97054 lines)
File diff suppressed because it is too large.

crates/ufst/data/words-10000 (Normal file, 10000 lines)
File diff suppressed because it is too large.

crates/ufst/data/words-100000 (Normal file, 100001 lines)
File diff suppressed because it is too large.

crates/ufst/rustfmt.toml (Normal file, 2 lines)
@@ -0,0 +1,2 @@
max_width = 79
use_small_heuristics = "max"

crates/ufst/scripts/gen-common-inputs (Executable file, 39 lines)
@@ -0,0 +1,39 @@
#!/usr/bin/env python

from __future__ import absolute_import, division, print_function
import codecs
from operator import itemgetter
import sys


if __name__ == '__main__':
    # Get frequency counts of each byte.
    freqs = [0] * 256  # byte |--> frequency
    for fpath in sys.argv[1:]:
        with codecs.open(fpath, 'r', 'utf-8') as fin:
            for line in fin:
                for byte in line.strip().encode('utf-8'):
                    freqs[byte] += 1

    # Create the inverse mapping.
    orders = [0] * 256  # byte |--> sort index, descending
    sort_by_freq = sorted(zip(range(256), freqs),
                          key=itemgetter(1), reverse=True)
    for sort_idx, byte in enumerate(map(itemgetter(0), sort_by_freq)):
        orders[byte] = sort_idx

    # Now write Rust.
    olines = ['pub const COMMON_INPUTS: [u8; 256] = [']
    for byte in range(256):
        olines.append('    %3d, // %r' % (orders[byte], chr(byte)))
    olines.append('];')
    olines.append('')
    olines.append('pub const COMMON_INPUTS_INV: [u8; 256] = [')
    for sort_idx in range(256):
        byte = orders.index(sort_idx)
        if byte <= 127:
            olines.append('    b%r,' % chr(byte))
        else:
            olines.append("    b'\\x%x'," % byte)
    olines.append('];')
    print('\n'.join(olines))
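
In Rust terms, the two generated tables are inverse permutations; a sketch of the invariant they satisfy (not code in this commit):

```rust
// COMMON_INPUTS maps a byte to its frequency rank (most frequent = 0);
// COMMON_INPUTS_INV maps a rank back to the byte.
for b in 0..=255u8 {
    let rank = COMMON_INPUTS[b as usize];
    assert_eq!(COMMON_INPUTS_INV[rank as usize], b);
}
```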

crates/ufst/src/automaton/mod.rs (Normal file, 440 lines)
@@ -0,0 +1,440 @@
/// Automaton describes types that behave as a finite automaton.
///
/// All implementors of this trait are represented by *byte based* automata.
/// Stated differently, all transitions in the automata correspond to a single
/// byte in the input.
///
/// This implementation choice is important for a couple reasons:
///
/// 1. The set of possible transitions in each node is small, which may make
///    efficient memory usage easier.
/// 2. The finite state transducers in this crate are all byte based, so any
///    automata used on them must also be byte based.
///
/// In practice, this does present somewhat of a problem, for example, if
/// you're storing UTF-8 encoded strings in a finite state transducer. Consider
/// using a `Levenshtein` automaton, which accepts a query string and an edit
/// distance. The edit distance should apply to some notion of *character*,
/// which could be represented by at least 1-4 bytes in a UTF-8 encoding (for
/// some definition of "character"). Therefore, the automaton must have UTF-8
/// decoding built into it. This can be tricky to implement, so you may find
/// the [`utf8-ranges`](https://crates.io/crates/utf8-ranges) crate useful.
pub trait Automaton {
    /// The type of the state used in the automaton.
    type State;

    /// Returns a single start state for this automaton.
    ///
    /// This method should always return the same value for each
    /// implementation.
    fn start(&self) -> Self::State;

    /// Returns true if and only if `state` is a match state.
    fn is_match(&self, state: &Self::State) -> bool;

    /// Returns true if and only if `state` can lead to a match in zero or more
    /// steps.
    ///
    /// If this returns `false`, then no sequence of inputs from this state
    /// should ever produce a match. If this does not follow, then those match
    /// states may never be reached. In other words, behavior may be incorrect.
    ///
    /// If this returns `true` even when no match is possible, then behavior
    /// will be correct, but callers may be forced to do additional work.
    fn can_match(&self, _state: &Self::State) -> bool {
        true
    }

    /// Returns true if and only if `state` matches and must match no matter
    /// what steps are taken.
    ///
    /// If this returns `true`, then every sequence of inputs from this state
    /// produces a match. If this does not follow, then those match states may
    /// never be reached. In other words, behavior may be incorrect.
    ///
    /// If this returns `false` even when every sequence of inputs will lead to
    /// a match, then behavior will be correct, but callers may be forced to do
    /// additional work.
    fn will_always_match(&self, _state: &Self::State) -> bool {
        false
    }

    /// Return the next state given `state` and an input.
    fn accept(&self, state: &Self::State, byte: u8) -> Self::State;

    /// If applicable, return the next state when the end of a key is seen.
    fn accept_eof(&self, _: &Self::State) -> Option<Self::State> {
        None
    }

    /// Returns an automaton that matches the strings that start with something
    /// this automaton matches.
    fn starts_with(self) -> StartsWith<Self>
    where
        Self: Sized,
    {
        StartsWith(self)
    }

    /// Returns an automaton that matches the strings matched by either this or
    /// the other automaton.
    fn union<Rhs: Automaton>(self, rhs: Rhs) -> Union<Self, Rhs>
    where
        Self: Sized,
    {
        Union(self, rhs)
    }

    /// Returns an automaton that matches the strings matched by both this and
    /// the other automaton.
    fn intersection<Rhs: Automaton>(self, rhs: Rhs) -> Intersection<Self, Rhs>
    where
        Self: Sized,
    {
        Intersection(self, rhs)
    }

    /// Returns an automaton that matches the strings not matched by this
    /// automaton.
    fn complement(self) -> Complement<Self>
    where
        Self: Sized,
    {
        Complement(self)
    }
}

impl<'a, T: Automaton> Automaton for &'a T {
    type State = T::State;

    fn start(&self) -> T::State {
        (*self).start()
    }

    fn is_match(&self, state: &T::State) -> bool {
        (*self).is_match(state)
    }

    fn can_match(&self, state: &T::State) -> bool {
        (*self).can_match(state)
    }

    fn will_always_match(&self, state: &T::State) -> bool {
        (*self).will_always_match(state)
    }

    fn accept(&self, state: &T::State, byte: u8) -> T::State {
        (*self).accept(state, byte)
    }

    fn accept_eof(&self, state: &Self::State) -> Option<Self::State> {
        (*self).accept_eof(state)
    }
}

/// An automaton that matches if the input equals a specific string.
///
/// It can be used in combination with [`StartsWith`] to search strings
/// starting with a given prefix.
#[derive(Clone, Debug)]
pub struct Str<'a> {
    string: &'a [u8],
}

impl<'a> Str<'a> {
    /// Constructs an automaton that matches an exact string.
    #[inline]
    pub fn new(string: &'a str) -> Str<'a> {
        Str { string: string.as_bytes() }
    }
}

impl<'a> Automaton for Str<'a> {
    type State = Option<usize>;

    #[inline]
    fn start(&self) -> Option<usize> {
        Some(0)
    }

    #[inline]
    fn is_match(&self, pos: &Option<usize>) -> bool {
        *pos == Some(self.string.len())
    }

    #[inline]
    fn can_match(&self, pos: &Option<usize>) -> bool {
        pos.is_some()
    }

    #[inline]
    fn accept(&self, pos: &Option<usize>, byte: u8) -> Option<usize> {
        // if we aren't already past the end...
        if let Some(pos) = *pos {
            // and there is still a matching byte at the current position...
            if self.string.get(pos).cloned() == Some(byte) {
                // then move forward
                return Some(pos + 1);
            }
        }
        // otherwise we're either past the end or didn't match the byte
        None
    }
}

/// An automaton that matches if the input contains a specific subsequence.
///
/// It can be used to build a simple fuzzy-finder.
#[derive(Clone, Debug)]
pub struct Subsequence<'a> {
    subseq: &'a [u8],
}

impl<'a> Subsequence<'a> {
    /// Constructs an automaton that matches input containing the
    /// specified subsequence.
    #[inline]
    pub fn new(subsequence: &'a str) -> Subsequence<'a> {
        Subsequence { subseq: subsequence.as_bytes() }
    }
}

impl<'a> Automaton for Subsequence<'a> {
    type State = usize;

    #[inline]
    fn start(&self) -> usize {
        0
    }

    #[inline]
    fn is_match(&self, &state: &usize) -> bool {
        state == self.subseq.len()
    }

    #[inline]
    fn can_match(&self, _: &usize) -> bool {
        true
    }

    #[inline]
    fn will_always_match(&self, &state: &usize) -> bool {
        state == self.subseq.len()
    }

    #[inline]
    fn accept(&self, &state: &usize, byte: u8) -> usize {
        if state == self.subseq.len() {
            return state;
        }
        state + (byte == self.subseq[state]) as usize
    }
}

/// An automaton that always matches.
///
/// This is useful in a generic context as a way to express that no automaton
/// should be used.
#[derive(Clone, Debug)]
pub struct AlwaysMatch;

impl Automaton for AlwaysMatch {
    type State = ();

    #[inline]
    fn start(&self) {}

    #[inline]
    fn is_match(&self, _: &()) -> bool {
        true
    }

    #[inline]
    fn can_match(&self, _: &()) -> bool {
        true
    }

    #[inline]
    fn will_always_match(&self, _: &()) -> bool {
        true
    }

    #[inline]
    fn accept(&self, _: &(), _: u8) {}
}

/// An automaton that matches a string that begins with something that the
/// wrapped automaton matches.
#[derive(Clone, Debug)]
pub struct StartsWith<A>(A);

/// The `Automaton` state for `StartsWith<A>`.
pub struct StartsWithState<A: Automaton>(StartsWithStateKind<A>);

enum StartsWithStateKind<A: Automaton> {
    Done,
    Running(A::State),
}

impl<A: Automaton> Automaton for StartsWith<A> {
    type State = StartsWithState<A>;

    fn start(&self) -> StartsWithState<A> {
        StartsWithState({
            let inner = self.0.start();
            if self.0.is_match(&inner) {
                StartsWithStateKind::Done
            } else {
                StartsWithStateKind::Running(inner)
            }
        })
    }

    fn is_match(&self, state: &StartsWithState<A>) -> bool {
        match state.0 {
            StartsWithStateKind::Done => true,
            StartsWithStateKind::Running(_) => false,
        }
    }

    fn can_match(&self, state: &StartsWithState<A>) -> bool {
        match state.0 {
            StartsWithStateKind::Done => true,
            StartsWithStateKind::Running(ref inner) => self.0.can_match(inner),
        }
    }

    fn will_always_match(&self, state: &StartsWithState<A>) -> bool {
        match state.0 {
            StartsWithStateKind::Done => true,
            StartsWithStateKind::Running(_) => false,
        }
    }

    fn accept(
        &self,
        state: &StartsWithState<A>,
        byte: u8,
    ) -> StartsWithState<A> {
        StartsWithState(match state.0 {
            StartsWithStateKind::Done => StartsWithStateKind::Done,
            StartsWithStateKind::Running(ref inner) => {
                let next_inner = self.0.accept(inner, byte);
                if self.0.is_match(&next_inner) {
                    StartsWithStateKind::Done
                } else {
                    StartsWithStateKind::Running(next_inner)
                }
            }
        })
    }
}

/// An automaton that matches when one of its component automata match.
#[derive(Clone, Debug)]
pub struct Union<A, B>(A, B);

/// The `Automaton` state for `Union<A, B>`.
pub struct UnionState<A: Automaton, B: Automaton>(A::State, B::State);

impl<A: Automaton, B: Automaton> Automaton for Union<A, B> {
    type State = UnionState<A, B>;

    fn start(&self) -> UnionState<A, B> {
        UnionState(self.0.start(), self.1.start())
    }

    fn is_match(&self, state: &UnionState<A, B>) -> bool {
        self.0.is_match(&state.0) || self.1.is_match(&state.1)
    }

    fn can_match(&self, state: &UnionState<A, B>) -> bool {
        self.0.can_match(&state.0) || self.1.can_match(&state.1)
    }

    fn will_always_match(&self, state: &UnionState<A, B>) -> bool {
        self.0.will_always_match(&state.0)
            || self.1.will_always_match(&state.1)
    }

    fn accept(&self, state: &UnionState<A, B>, byte: u8) -> UnionState<A, B> {
        UnionState(
            self.0.accept(&state.0, byte),
            self.1.accept(&state.1, byte),
        )
    }
}

/// An automaton that matches when both of its component automata match.
#[derive(Clone, Debug)]
pub struct Intersection<A, B>(A, B);

/// The `Automaton` state for `Intersection<A, B>`.
pub struct IntersectionState<A: Automaton, B: Automaton>(A::State, B::State);

impl<A: Automaton, B: Automaton> Automaton for Intersection<A, B> {
    type State = IntersectionState<A, B>;

    fn start(&self) -> IntersectionState<A, B> {
        IntersectionState(self.0.start(), self.1.start())
    }

    fn is_match(&self, state: &IntersectionState<A, B>) -> bool {
        self.0.is_match(&state.0) && self.1.is_match(&state.1)
    }

    fn can_match(&self, state: &IntersectionState<A, B>) -> bool {
        self.0.can_match(&state.0) && self.1.can_match(&state.1)
    }

    fn will_always_match(&self, state: &IntersectionState<A, B>) -> bool {
        self.0.will_always_match(&state.0)
            && self.1.will_always_match(&state.1)
    }

    fn accept(
        &self,
        state: &IntersectionState<A, B>,
        byte: u8,
    ) -> IntersectionState<A, B> {
        IntersectionState(
            self.0.accept(&state.0, byte),
            self.1.accept(&state.1, byte),
        )
    }
}

/// An automaton that matches exactly when the automaton it wraps does not.
#[derive(Clone, Debug)]
pub struct Complement<A>(A);

/// The `Automaton` state for `Complement<A>`.
pub struct ComplementState<A: Automaton>(A::State);

impl<A: Automaton> Automaton for Complement<A> {
    type State = ComplementState<A>;

    fn start(&self) -> ComplementState<A> {
        ComplementState(self.0.start())
    }

    fn is_match(&self, state: &ComplementState<A>) -> bool {
        !self.0.is_match(&state.0)
    }

    fn can_match(&self, state: &ComplementState<A>) -> bool {
        !self.0.will_always_match(&state.0)
    }

    fn will_always_match(&self, state: &ComplementState<A>) -> bool {
        !self.0.can_match(&state.0)
    }

    fn accept(
        &self,
        state: &ComplementState<A>,
        byte: u8,
    ) -> ComplementState<A> {
        ComplementState(self.0.accept(&state.0, byte))
    }
}
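
A small sketch of composing these automata by hand, feeding bytes directly rather than running them against an FST (the input string is arbitrary):

```rust
use ufst::automaton::{Automaton, Str, Subsequence};

// Match keys that start with "foo" or contain "bar" as a subsequence.
let aut = Str::new("foo").starts_with().union(Subsequence::new("bar"));

let mut state = aut.start();
for &b in b"foxbxar" {
    state = aut.accept(&state, b);
}
// "foxbxar" does not start with "foo", but "bar" is a subsequence of it.
assert!(aut.is_match(&state));
```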
121
crates/ufst/src/bytes.rs
Normal file
@@ -0,0 +1,121 @@
use std::convert::TryInto;
use std::io;

/// Read a u64 in little endian format from the beginning of the given slice.
/// This panics if the slice has length less than 8.
#[inline]
pub fn read_u64_le(slice: &[u8]) -> u64 {
    u64::from_le_bytes(slice[..8].try_into().unwrap())
}

/// Write a u64 in little endian format to the beginning of the given slice.
/// This panics if the slice has length less than 8.
#[inline]
pub fn write_u64_le(n: u64, slice: &mut [u8]) {
    assert!(slice.len() >= 8);
    let bytes = n.to_le_bytes();
    slice[0] = bytes[0];
    slice[1] = bytes[1];
    slice[2] = bytes[2];
    slice[3] = bytes[3];
    slice[4] = bytes[4];
    slice[5] = bytes[5];
    slice[6] = bytes[6];
    slice[7] = bytes[7];
}

/// Like write_u64_le, but to an io::Write implementation. If not all bytes
/// could be written, then this returns an error.
#[inline]
pub fn io_write_u64_le<W: io::Write>(n: u64, mut wtr: W) -> io::Result<()> {
    let mut buf = [0; 8];
    write_u64_le(n, &mut buf);
    wtr.write_all(&buf)
}

/// pack_uint packs the given integer in the smallest number of bytes possible,
/// and writes it to the given writer. The number of bytes written is returned
/// on success.
#[inline]
pub fn pack_uint<W: io::Write>(wtr: W, n: u64) -> io::Result<u8> {
    let nbytes = pack_size(n);
    pack_uint_in(wtr, n, nbytes).map(|_| nbytes)
}

/// pack_uint_in is like pack_uint, but always uses the number of bytes given
/// to pack the number given.
///
/// `nbytes` must be >= pack_size(n) and <= 8, where `pack_size(n)` is the
/// smallest number of bytes that can store the integer given.
#[inline]
pub fn pack_uint_in<W: io::Write>(
    mut wtr: W,
    mut n: u64,
    nbytes: u8,
) -> io::Result<()> {
    assert!((1..=8).contains(&nbytes));
    let mut buf = [0u8; 8];
    for i in 0..nbytes {
        buf[i as usize] = n as u8;
        n >>= 8;
    }
    wtr.write_all(&buf[..nbytes as usize])?;
    Ok(())
}

/// unpack_uint is the dual of pack_uint. It unpacks the integer at the current
/// position in `slice` after reading `nbytes` bytes.
///
/// `nbytes` must be >= 1 and <= 8.
#[inline]
pub fn unpack_uint(slice: &[u8], nbytes: u8) -> u64 {
    assert!((1..=8).contains(&nbytes));

    let mut n = 0;
    for (i, &b) in slice[..nbytes as usize].iter().enumerate() {
        n |= (b as u64) << (8 * i);
    }
    n
}

/// pack_size returns the smallest number of bytes that can encode `n`.
#[inline]
pub fn pack_size(n: u64) -> u8 {
    if n < 1 << 8 {
        1
    } else if n < 1 << 16 {
        2
    } else if n < 1 << 24 {
        3
    } else if n < 1 << 32 {
        4
    } else if n < 1 << 40 {
        5
    } else if n < 1 << 48 {
        6
    } else if n < 1 << 56 {
        7
    } else {
        8
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use quickcheck::{QuickCheck, StdGen};
    use std::io;

    #[test]
    fn prop_pack_in_out() {
        fn p(num: u64) -> bool {
            let mut buf = io::Cursor::new(vec![]);
            let size = pack_uint(&mut buf, num).unwrap();
            buf.set_position(0);
            num == unpack_uint(buf.get_ref(), size)
        }
        QuickCheck::new()
            .gen(StdGen::new(::rand::thread_rng(), 257)) // pick byte boundary
            .quickcheck(p as fn(u64) -> bool);
    }
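
    // An illustrative extra test (a sketch, not part of the original diff):
    // pack_size picks the minimal width, and pack_uint/unpack_uint round-trip
    // exactly at a byte boundary.
    #[test]
    fn pack_boundaries() {
        assert_eq!(pack_size(0xff), 1);
        assert_eq!(pack_size(0x100), 2);
        let mut buf = vec![];
        let nbytes = pack_uint(&mut buf, 0x100).unwrap();
        assert_eq!(nbytes, 2);
        assert_eq!(unpack_uint(&buf, nbytes), 0x100);
    }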
}

49
crates/ufst/src/error.rs
Normal file
@@ -0,0 +1,49 @@
use std::fmt;
use std::io;

use crate::raw;

/// A `Result` type alias for this crate's `Error` type.
pub type Result<T> = std::result::Result<T, Error>;

/// An error that encapsulates all possible errors in this crate.
#[derive(Debug)]
pub enum Error {
    /// An error that occurred while reading or writing a finite state
    /// transducer.
    Fst(raw::Error),
    /// An IO error that occurred while writing a finite state transducer.
    Io(io::Error),
}

impl From<io::Error> for Error {
    #[inline]
    fn from(err: io::Error) -> Error {
        Error::Io(err)
    }
}

impl From<raw::Error> for Error {
    #[inline]
    fn from(err: raw::Error) -> Error {
        Error::Fst(err)
    }
}

impl fmt::Display for Error {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match *self {
            Error::Fst(_) => write!(f, "FST error"),
            Error::Io(_) => write!(f, "I/O error"),
        }
    }
}

impl std::error::Error for Error {
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match *self {
            Error::Fst(ref err) => Some(err),
            Error::Io(ref err) => Some(err),
        }
    }
}
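
// An illustrative sketch (not part of the original diff): the two From impls
// above let functions that return this crate's Result propagate either error
// kind with the ? operator, e.g.:
//
//     fn write_len<W: std::io::Write>(mut w: W, n: u64) -> Result<()> {
//         crate::bytes::io_write_u64_le(n, &mut w)?; // io::Error -> Error
//         Ok(())
//     }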
18
crates/ufst/src/lib.rs
Normal file
@@ -0,0 +1,18 @@
pub use crate::automaton::Automaton;
pub use crate::error::{Error, Result};
pub use crate::stream::{IntoStreamer, Streamer};

mod bytes;
mod error;
#[path = "automaton/mod.rs"]
mod inner_automaton;
pub mod raw;
mod stream;

/// Automaton implementations for finite state transducers.
///
/// This module defines a trait, `Automaton`, with several implementations
/// including, but not limited to, union, intersection and complement.
pub mod automaton {
    pub use crate::inner_automaton::*;
}
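
// An illustrative note (an assumption about intent, not part of the original
// diff): the #[path] indirection compiles automaton/mod.rs as the private
// module `inner_automaton`, and the public `automaton` module re-exports its
// contents, so callers write e.g. `ufst::automaton::Automaton` while the
// source stays under src/automaton/.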
464
crates/ufst/src/raw/build.rs
Normal file
@@ -0,0 +1,464 @@
use std::io;

use crate::bytes;
use crate::error::Result;
use crate::raw::counting_writer::CountingWriter;
use crate::raw::error::Error;
use crate::raw::registry::{Registry, RegistryEntry};
use crate::raw::{
    CompiledAddr, Fst, Output, Transition, EMPTY_ADDRESS, NONE_ADDRESS,
};
// use raw::registry_minimal::{Registry, RegistryEntry};
use crate::stream::{IntoStreamer, Streamer};

/// A builder for creating a finite state transducer.
///
/// This is not your average everyday builder. It has two important qualities
/// that make it a bit unique from what you might expect:
///
/// 1. All keys must be added in lexicographic order. Adding a key out of order
///    will result in an error. Additionally, adding a duplicate key with an
///    output value will also result in an error. That is, once a key is
///    associated with a value, that association can never be modified or
///    deleted.
/// 2. The representation of an fst is streamed to *any* `io::Write` as it is
///    built. For an in memory representation, this can be a `Vec<u8>`.
///
/// Point (2) is especially important because it means that an fst can be
/// constructed *without storing the entire fst in memory*. Namely, since it
/// works with any `io::Write`, it can be streamed directly to a file.
///
/// With that said, the builder does use memory, but **memory usage is bounded
/// to a constant size**. The amount of memory used trades off with the
/// compression ratio. Currently, the implementation hard codes this trade off
/// which can result in about 5-20MB of heap usage during construction. (N.B.
/// Guaranteeing a maximal compression ratio requires memory proportional to
/// the size of the fst, which defeats some of the benefit of streaming
/// it to disk. In practice, a small bounded amount of memory achieves
/// close-to-minimal compression ratios.)
///
/// The algorithmic complexity of fst construction is `O(n)` where `n` is the
/// number of elements added to the fst.
pub struct Builder<W> {
    /// The FST raw data is written directly to `wtr`.
    ///
    /// No internal buffering is done.
    wtr: CountingWriter<W>,
    /// The stack of unfinished nodes.
    ///
    /// An unfinished node is a node that could potentially have a new
    /// transition added to it when a new word is added to the dictionary.
    unfinished: UnfinishedNodes,
    /// A map of finished nodes.
    ///
    /// A finished node is one that has been compiled and written to `wtr`.
    /// After this point, the node is considered immutable and will never
    /// change.
    registry: Registry,
    /// The last word added.
    ///
    /// This is used to enforce the invariant that words are added in sorted
    /// order.
    last: Option<Vec<u8>>,
    /// The address of the last compiled node.
    ///
    /// This is used to optimize states with one transition that point
    /// to the previously compiled node. (The previously compiled node in
    /// this case actually corresponds to the next state for the transition,
    /// since states are compiled in reverse.)
    last_addr: CompiledAddr,
    /// The number of keys added.
    len: usize,
}

#[derive(Debug)]
struct UnfinishedNodes {
    stack: Vec<BuilderNodeUnfinished>,
}

#[derive(Debug)]
struct BuilderNodeUnfinished {
    node: BuilderNode,
    last: Option<LastTransition>,
}

#[derive(Debug, Hash, Eq, PartialEq)]
pub struct BuilderNode {
    pub is_final: bool,
    pub final_output: Output,
    pub trans: Vec<Transition>,
}

#[derive(Debug)]
struct LastTransition {
    inp: u8,
    out: Output,
}

impl Builder<Vec<u8>> {
    /// Create a builder that builds an fst in memory.
    #[inline]
    pub fn memory() -> Builder<Vec<u8>> {
        Builder::new(Vec::with_capacity(10 * (1 << 10))).unwrap()
    }

    /// Finishes construction of the FST and returns it.
    #[inline]
    pub fn into_fst(self) -> Fst<Vec<u8>> {
        self.into_inner().and_then(Fst::new).unwrap()
    }
}
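
// An illustrative usage sketch (not part of the original diff): keys must be
// inserted in lexicographic order, and the encoded fst lands in the Vec<u8>
// backing the builder.
//
//     let mut b = Builder::memory();
//     b.insert(b"bar", 1).unwrap();
//     b.insert(b"baz", 2).unwrap();
//     b.insert(b"foo", 3).unwrap(); // out-of-order keys would error here
//     let fst = b.into_fst();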

impl<W: io::Write> Builder<W> {
    /// Create a builder that builds an fst by writing it to `wtr` in a
    /// streaming fashion.
    pub fn new(wtr: W) -> Result<Builder<W>> {
        let wtr = CountingWriter::new(wtr);

        Ok(Builder {
            wtr,
            unfinished: UnfinishedNodes::new(),
            registry: Registry::new(10_000, 2),
            last: None,
            last_addr: NONE_ADDRESS,
            len: 0,
        })
    }

    /// Adds a byte string to this FST with a zero output value.
    pub fn add<B>(&mut self, bs: B) -> Result<()>
    where
        B: AsRef<[u8]>,
    {
        self.check_last_key(bs.as_ref(), false)?;
        self.insert_output(bs, None)
    }

    /// Insert a new key-value pair into the fst.
    ///
    /// Keys must be convertible to byte strings. Values must be a `u64`, which
    /// is a restriction of the current implementation of finite state
    /// transducers. (Values may one day be expanded to other types.)
    ///
    /// If a key is inserted that is less than or equal to any previous key
    /// added, then an error is returned. Similarly, if there was a problem
    /// writing to the underlying writer, an error is returned.
    pub fn insert<B>(&mut self, bs: B, val: u64) -> Result<()>
    where
        B: AsRef<[u8]>,
    {
        self.check_last_key(bs.as_ref(), true)?;
        self.insert_output(bs, Some(Output::new(val)))
    }

    /// Calls insert on each item in the iterator.
    ///
    /// If an error occurred while adding an element, processing is stopped
    /// and the error is returned.
    ///
    /// If a key is inserted that is less than or equal to any previous key
    /// added, then an error is returned. Similarly, if there was a problem
    /// writing to the underlying writer, an error is returned.
    pub fn extend_iter<T, I>(&mut self, iter: I) -> Result<()>
    where
        T: AsRef<[u8]>,
        I: IntoIterator<Item = (T, Output)>,
    {
        for (key, out) in iter {
            self.insert(key, out.value())?;
        }
        Ok(())
    }

    /// Calls insert on each item in the stream.
    ///
    /// Note that unlike `extend_iter`, this is not generic on the items in
    /// the stream.
    ///
    /// If a key is inserted that is less than or equal to any previous key
    /// added, then an error is returned. Similarly, if there was a problem
    /// writing to the underlying writer, an error is returned.
    pub fn extend_stream<'f, I, S>(&mut self, stream: I) -> Result<()>
    where
        I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>,
        S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], Output)>,
    {
        let mut stream = stream.into_stream();
        while let Some((key, out)) = stream.next() {
            self.insert(key, out.value())?;
        }
        Ok(())
    }

    /// Finishes the construction of the fst and flushes the underlying
    /// writer. After completion, the data written to `W` may be read using
    /// one of `Fst`'s constructor methods.
    pub fn finish(self) -> Result<()> {
        self.into_inner()?;
        Ok(())
    }

    /// Just like `finish`, except it returns the underlying writer after
    /// flushing it.
    pub fn into_inner(mut self) -> Result<W> {
        self.compile_from(0)?;
        let root_node = self.unfinished.pop_root();
        let root_addr = self.compile(&root_node)?;
        bytes::io_write_u64_le(self.len as u64, &mut self.wtr)?;
        bytes::io_write_u64_le(root_addr as u64, &mut self.wtr)?;

        // let sum = self.wtr.masked_checksum();
        let mut wtr = self.wtr.into_inner();
        // bytes::io_write_u32_le(sum, &mut wtr)?;
        wtr.flush()?;
        Ok(wtr)
    }

    fn insert_output<B>(&mut self, bs: B, out: Option<Output>) -> Result<()>
    where
        B: AsRef<[u8]>,
    {
        let bs = bs.as_ref();
        if bs.is_empty() {
            self.len = 1;
            // must be first key, so length is always 1
            #[allow(clippy::or_fun_call)]
            self.unfinished.set_root_output(out.unwrap_or(Output::zero()));
            return Ok(());
        }
        let (prefix_len, out) = if let Some(out) = out {
            self.unfinished.find_common_prefix_and_set_output(bs, out)
        } else {
            (self.unfinished.find_common_prefix(bs), Output::zero())
        };
        if prefix_len == bs.len() {
            // If the prefix found consumes the entire set of bytes, then
            // the prefix *equals* the bytes given. This means it is a
            // duplicate value with no output. So we can give up here.
            //
            // If the below assert fails, then that means we let a duplicate
            // value through even when inserting outputs.
            assert!(out.is_zero());
            return Ok(());
        }
        self.len += 1;
        self.compile_from(prefix_len)?;
        self.unfinished.add_suffix(&bs[prefix_len..], out);
        Ok(())
    }

    fn compile_from(&mut self, istate: usize) -> Result<()> {
        let mut addr = NONE_ADDRESS;
        while istate + 1 < self.unfinished.len() {
            let node = if addr == NONE_ADDRESS {
                self.unfinished.pop_empty()
            } else {
                self.unfinished.pop_freeze(addr)
            };
            addr = self.compile(&node)?;
            assert!(addr != NONE_ADDRESS);
        }
        self.unfinished.top_last_freeze(addr);
        Ok(())
    }

    fn compile(&mut self, node: &BuilderNode) -> Result<CompiledAddr> {
        if node.is_final
            && node.trans.is_empty()
            && node.final_output.is_zero()
        {
            return Ok(EMPTY_ADDRESS);
        }
        let entry = self.registry.entry(node);
        if let RegistryEntry::Found(ref addr) = entry {
            return Ok(*addr);
        }
        let start_addr = self.wtr.count() as CompiledAddr;
        node.compile_to(&mut self.wtr, self.last_addr, start_addr)?;
        self.last_addr = self.wtr.count() as CompiledAddr - 1;
        if let RegistryEntry::NotFound(cell) = entry {
            cell.insert(self.last_addr);
        }
        Ok(self.last_addr)
    }

    fn check_last_key(&mut self, bs: &[u8], check_dupe: bool) -> Result<()> {
        if let Some(ref mut last) = self.last {
            if check_dupe && bs == &**last {
                return Err(Error::DuplicateKey { got: bs.to_vec() }.into());
            }
            if bs < &**last {
                return Err(Error::OutOfOrder {
                    previous: last.to_vec(),
                    got: bs.to_vec(),
                }
                .into());
            }
            last.clear();
            for &b in bs {
                last.push(b);
            }
        } else {
            self.last = Some(bs.to_vec());
        }
        Ok(())
    }

    /// Gets a reference to the underlying writer.
    pub fn get_ref(&self) -> &W {
        self.wtr.get_ref()
    }

    /// Returns the number of bytes written to the underlying writer.
    pub fn bytes_written(&self) -> u64 {
        self.wtr.count()
    }
}

impl UnfinishedNodes {
    fn new() -> UnfinishedNodes {
        let mut unfinished = UnfinishedNodes { stack: Vec::with_capacity(64) };
        unfinished.push_empty(false);
        unfinished
    }

    fn len(&self) -> usize {
        self.stack.len()
    }

    fn push_empty(&mut self, is_final: bool) {
        self.stack.push(BuilderNodeUnfinished {
            node: BuilderNode { is_final, ..BuilderNode::default() },
            last: None,
        });
    }

    fn pop_root(&mut self) -> BuilderNode {
        assert!(self.stack.len() == 1);
        assert!(self.stack[0].last.is_none());
        self.stack.pop().unwrap().node
    }

    fn pop_freeze(&mut self, addr: CompiledAddr) -> BuilderNode {
        let mut unfinished = self.stack.pop().unwrap();
        unfinished.last_compiled(addr);
        unfinished.node
    }

    fn pop_empty(&mut self) -> BuilderNode {
        let unfinished = self.stack.pop().unwrap();
        assert!(unfinished.last.is_none());
        unfinished.node
    }

    fn set_root_output(&mut self, out: Output) {
        self.stack[0].node.is_final = true;
        self.stack[0].node.final_output = out;
    }

    fn top_last_freeze(&mut self, addr: CompiledAddr) {
        let last = self.stack.len().checked_sub(1).unwrap();
        self.stack[last].last_compiled(addr);
    }

    fn add_suffix(&mut self, bs: &[u8], out: Output) {
        if bs.is_empty() {
            return;
        }
        let last = self.stack.len().checked_sub(1).unwrap();
        assert!(self.stack[last].last.is_none());
        self.stack[last].last = Some(LastTransition { inp: bs[0], out });
        for &b in &bs[1..] {
            self.stack.push(BuilderNodeUnfinished {
                node: BuilderNode::default(),
                last: Some(LastTransition { inp: b, out: Output::zero() }),
            });
        }
        self.push_empty(true);
    }

    fn find_common_prefix(&mut self, bs: &[u8]) -> usize {
        bs.iter()
            .zip(&self.stack)
            .take_while(|&(&b, node)| {
                node.last.as_ref().map(|t| t.inp == b).unwrap_or(false)
            })
            .count()
    }
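
    // A worked example (illustrative, not part of the original diff): when a
    // new key shares a prefix with the key on the stack, the next method
    // keeps only the shared part of each transition's output on the common
    // prefix and pushes the remainder further down, so every key's output is
    // still the sum of the outputs along its path.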

    fn find_common_prefix_and_set_output(
        &mut self,
        bs: &[u8],
        mut out: Output,
    ) -> (usize, Output) {
        let mut i = 0;
        while i < bs.len() {
            let add_prefix = match self.stack[i].last.as_mut() {
                Some(ref mut t) if t.inp == bs[i] => {
                    i += 1;
                    let common_pre = t.out.prefix(out);
                    let add_prefix = t.out.sub(common_pre);
                    out = out.sub(common_pre);
                    t.out = common_pre;
                    add_prefix
                }
                _ => break,
            };
            if !add_prefix.is_zero() {
                self.stack[i].add_output_prefix(add_prefix);
            }
        }
        (i, out)
    }
}

impl BuilderNodeUnfinished {
    fn last_compiled(&mut self, addr: CompiledAddr) {
        if let Some(trans) = self.last.take() {
            self.node.trans.push(Transition {
                inp: trans.inp,
                out: trans.out,
                addr,
            });
        }
    }

    fn add_output_prefix(&mut self, prefix: Output) {
        if self.node.is_final {
            self.node.final_output = prefix.cat(self.node.final_output);
        }
        for t in &mut self.node.trans {
            t.out = prefix.cat(t.out);
        }
        if let Some(ref mut t) = self.last {
            t.out = prefix.cat(t.out);
        }
    }
}

impl Clone for BuilderNode {
    fn clone(&self) -> BuilderNode {
        BuilderNode {
            is_final: self.is_final,
            final_output: self.final_output,
            trans: self.trans.clone(),
        }
    }

    fn clone_from(&mut self, source: &BuilderNode) {
        self.is_final = source.is_final;
        self.final_output = source.final_output;
        self.trans.clear();
        self.trans.extend(source.trans.iter());
    }
}

impl Default for BuilderNode {
    fn default() -> BuilderNode {
        BuilderNode {
            is_final: false,
            final_output: Output::zero(),
            trans: vec![],
        }
    }
}

289
crates/ufst/src/raw/common_inputs.rs
Normal file
@@ -0,0 +1,289 @@
pub const COMMON_INPUTS: [u8; 256] = [
    // Control bytes '\x00' through '\x1f' are uncommon; they map to
    // frequency ranks 84 through 115:
    84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
    100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113,
    114, 115,
    116, // ' '
    80,  // '!'
    117, // '"'
    118, // '#'
    79,  // '$'
    39,  // '%'
    30,  // '&'
    81,  // "'"
    75,  // '('
    74,  // ')'
    82,  // '*'
    57,  // '+'
    66,  // ','
    16,  // '-'
    12,  // '.'
    2,   // '/'
    19,  // '0'
    20,  // '1'
    21,  // '2'
    27,  // '3'
    32,  // '4'
    29,  // '5'
    35,  // '6'
    36,  // '7'
    37,  // '8'
    34,  // '9'
    24,  // ':'
    73,  // ';'
    119, // '<'
    23,  // '='
    120, // '>'
    40,  // '?'
    83,  // '@'
    44,  // 'A'
    48,  // 'B'
    42,  // 'C'
    43,  // 'D'
    49,  // 'E'
    46,  // 'F'
    62,  // 'G'
    61,  // 'H'
    47,  // 'I'
    69,  // 'J'
    68,  // 'K'
    58,  // 'L'
    56,  // 'M'
    55,  // 'N'
    59,  // 'O'
    51,  // 'P'
    72,  // 'Q'
    54,  // 'R'
    45,  // 'S'
    52,  // 'T'
    64,  // 'U'
    65,  // 'V'
    63,  // 'W'
    71,  // 'X'
    67,  // 'Y'
    70,  // 'Z'
    77,  // '['
    121, // '\\'
    78,  // ']'
    122, // '^'
    31,  // '_'
    123, // '`'
    4,   // 'a'
    25,  // 'b'
    9,   // 'c'
    17,  // 'd'
    1,   // 'e'
    26,  // 'f'
    22,  // 'g'
    13,  // 'h'
    7,   // 'i'
    50,  // 'j'
    38,  // 'k'
    14,  // 'l'
    15,  // 'm'
    10,  // 'n'
    3,   // 'o'
    8,   // 'p'
    60,  // 'q'
    6,   // 'r'
    5,   // 's'
    0,   // 't'
    18,  // 'u'
    33,  // 'v'
    11,  // 'w'
    41,  // 'x'
    28,  // 'y'
    53,  // 'z'
    124, // '{'
    125, // '|'
    126, // '}'
    76,  // '~'
    127, // '\x7f'
    // Bytes '\x80' through '\xff' are uncommon and map to themselves:
    128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
    144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
    160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
    176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
    192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
    208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
    224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
    240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255,
];

pub const COMMON_INPUTS_INV: [u8; 256] = [
    b't', b'e', b'/', b'o', b'a', b's', b'r', b'i', b'p', b'c', b'n', b'w',
    b'.', b'h', b'l', b'm', b'-', b'd', b'u', b'0', b'1', b'2', b'g', b'=',
    b':', b'b', b'f', b'3', b'y', b'5', b'&', b'_', b'4', b'v', b'9', b'6',
    b'7', b'8', b'k', b'%', b'?', b'x', b'C', b'D', b'A', b'S', b'F', b'I',
    b'B', b'E', b'j', b'P', b'T', b'z', b'R', b'N', b'M', b'+', b'L', b'O',
    b'q', b'H', b'G', b'W', b'U', b'V', b',', b'Y', b'K', b'J', b'Z', b'X',
    b'Q', b';', b')', b'(', b'~', b'[', b']', b'$', b'!', b'\'', b'*', b'@',
    b'\x00', b'\x01', b'\x02', b'\x03', b'\x04', b'\x05', b'\x06', b'\x07',
    b'\x08', b'\t', b'\n', b'\x0b', b'\x0c', b'\r', b'\x0e', b'\x0f', b'\x10',
    b'\x11', b'\x12', b'\x13', b'\x14', b'\x15', b'\x16', b'\x17', b'\x18',
    b'\x19', b'\x1a', b'\x1b', b'\x1c', b'\x1d', b'\x1e', b'\x1f', b' ', b'"',
    b'#', b'<', b'>', b'\\', b'^', b'`', b'{', b'|', b'}', b'\x7f', b'\x80',
    b'\x81', b'\x82', b'\x83', b'\x84', b'\x85', b'\x86', b'\x87', b'\x88',
    b'\x89', b'\x8a', b'\x8b', b'\x8c', b'\x8d', b'\x8e', b'\x8f', b'\x90',
    b'\x91', b'\x92', b'\x93', b'\x94', b'\x95', b'\x96', b'\x97', b'\x98',
    b'\x99', b'\x9a', b'\x9b', b'\x9c', b'\x9d', b'\x9e', b'\x9f', b'\xa0',
    b'\xa1', b'\xa2', b'\xa3', b'\xa4', b'\xa5', b'\xa6', b'\xa7', b'\xa8',
    b'\xa9', b'\xaa', b'\xab', b'\xac', b'\xad', b'\xae', b'\xaf', b'\xb0',
    b'\xb1', b'\xb2', b'\xb3', b'\xb4', b'\xb5', b'\xb6', b'\xb7', b'\xb8',
    b'\xb9', b'\xba', b'\xbb', b'\xbc', b'\xbd', b'\xbe', b'\xbf', b'\xc0',
    b'\xc1', b'\xc2', b'\xc3', b'\xc4', b'\xc5', b'\xc6', b'\xc7', b'\xc8',
    b'\xc9', b'\xca', b'\xcb', b'\xcc', b'\xcd', b'\xce', b'\xcf', b'\xd0',
    b'\xd1', b'\xd2', b'\xd3', b'\xd4', b'\xd5', b'\xd6', b'\xd7', b'\xd8',
    b'\xd9', b'\xda', b'\xdb', b'\xdc', b'\xdd', b'\xde', b'\xdf', b'\xe0',
    b'\xe1', b'\xe2', b'\xe3', b'\xe4', b'\xe5', b'\xe6', b'\xe7', b'\xe8',
    b'\xe9', b'\xea', b'\xeb', b'\xec', b'\xed', b'\xee', b'\xef', b'\xf0',
    b'\xf1', b'\xf2', b'\xf3', b'\xf4', b'\xf5', b'\xf6', b'\xf7', b'\xf8',
    b'\xf9', b'\xfa', b'\xfb', b'\xfc', b'\xfd', b'\xfe', b'\xff',
];
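
// An illustrative check (a sketch, not part of the original diff): the two
// tables are inverse permutations, so encoding a byte as its frequency rank
// and decoding it again is lossless.
#[cfg(test)]
mod tests {
    use super::{COMMON_INPUTS, COMMON_INPUTS_INV};

    #[test]
    fn tables_are_inverse() {
        for b in 0..=255u8 {
            assert_eq!(COMMON_INPUTS_INV[COMMON_INPUTS[b as usize] as usize], b);
        }
    }
}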
57
crates/ufst/src/raw/counting_writer.rs
Normal file
@@ -0,0 +1,57 @@
use std::io;

/// Wraps any writer and counts the number of bytes written to it.
pub struct CountingWriter<W> {
    wtr: W,
    cnt: u64,
}

impl<W: io::Write> CountingWriter<W> {
    /// Wrap the given writer with a counter.
    pub fn new(wtr: W) -> CountingWriter<W> {
        CountingWriter { wtr, cnt: 0 }
    }

    /// Return the total number of bytes written to the underlying writer.
    ///
    /// The count returned is the sum of all counts resulting from a call
    /// to `write`.
    pub fn count(&self) -> u64 {
        self.cnt
    }

    /// Unwrap the counting writer and return the inner writer.
    pub fn into_inner(self) -> W {
        self.wtr
    }

    /// Gets a reference to the underlying writer.
    pub fn get_ref(&self) -> &W {
        &self.wtr
    }
}

impl<W: io::Write> io::Write for CountingWriter<W> {
    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
        let n = self.wtr.write(buf)?;
        self.cnt += n as u64;
        Ok(n)
    }

    fn flush(&mut self) -> io::Result<()> {
        self.wtr.flush()
    }
}

#[cfg(test)]
mod tests {
    use super::CountingWriter;
    use std::io::Write;

    #[test]
    fn counts_bytes() {
        let mut wtr = CountingWriter::new(vec![]);
        wtr.write_all(b"foobar").unwrap();
        assert_eq!(wtr.count(), 6);
    }
}

97
crates/ufst/src/raw/error.rs
Normal file
@@ -0,0 +1,97 @@
use std::fmt;
use std::str;
use std::string::FromUtf8Error;

/// An error that occurred while using a finite state transducer.
///
/// This enum is non-exhaustive. New variants may be added to it in
/// compatible releases.
#[non_exhaustive]
pub enum Error {
    /// An unexpected error occurred while reading a finite state transducer.
    /// Usually this occurs because the data is corrupted or is not actually
    /// a finite state transducer serialized by this library.
    Format {
        /// The number of bytes given to the FST constructor.
        size: usize,
    },
    /// A duplicate key was inserted into a finite state transducer, which is
    /// not allowed.
    DuplicateKey {
        /// The duplicate key.
        got: Vec<u8>,
    },
    /// A key was inserted out of order into a finite state transducer.
    ///
    /// Keys must always be inserted in lexicographic order.
    OutOfOrder {
        /// The last key successfully inserted.
        previous: Vec<u8>,
        /// The key that caused this error to occur.
        got: Vec<u8>,
    },
    /// An error that occurred when trying to decode a UTF-8 byte key.
    FromUtf8(FromUtf8Error),
}

impl fmt::Display for Error {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match *self {
            Error::FromUtf8(ref err) => err.fmt(f),
            Error::Format { size } => write!(
                f,
                "\
Error opening FST with size {} bytes: An unknown error occurred. This \
usually means you're trying to read data that isn't actually an encoded FST.",
                size
            ),
            Error::DuplicateKey { ref got } => write!(
                f,
                "Error inserting duplicate key: '{}'.",
                format_bytes(&*got)
            ),
            Error::OutOfOrder { ref previous, ref got } => write!(
                f,
                "\
Error inserting out-of-order key: '{}'. (Previous key was '{}'.) Keys must be \
inserted in lexicographic order.",
                format_bytes(&*got),
                format_bytes(&*previous)
            ),
        }
    }
}

impl fmt::Debug for Error {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        fmt::Display::fmt(self, f)
    }
}

impl std::error::Error for Error {
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match *self {
            Error::FromUtf8(ref err) => Some(err),
            _ => None,
        }
    }
}

impl From<FromUtf8Error> for Error {
    #[inline]
    fn from(err: FromUtf8Error) -> Error {
        Error::FromUtf8(err)
    }
}

/// Attempt to convert an arbitrary byte string to a more convenient display
/// form.
///
/// Essentially, try to decode the bytes as UTF-8 and show that. Failing that,
/// just show the sequence of bytes.
fn format_bytes(bytes: &[u8]) -> String {
    match str::from_utf8(bytes) {
        Ok(s) => s.to_owned(),
        Err(_) => format!("{:?}", bytes),
    }
}
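
// An illustrative test (a sketch, not part of the original diff): valid UTF-8
// is shown as text, while arbitrary bytes fall back to a Debug listing.
#[cfg(test)]
mod tests {
    use super::format_bytes;

    #[test]
    fn formats_utf8_and_raw_bytes() {
        assert_eq!(format_bytes(b"abc"), "abc");
        assert_eq!(format_bytes(&[0xff, 0xfe]), "[255, 254]");
    }
}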
1179
crates/ufst/src/raw/mod.rs
Normal file
File diff suppressed because it is too large
993
crates/ufst/src/raw/node.rs
Normal file
@@ -0,0 +1,993 @@
#![allow(clippy::unusual_byte_groupings)]

use std::cmp;
use std::fmt;
use std::io;
use std::ops::Range;

use crate::bytes;
use crate::raw::build::BuilderNode;
use crate::raw::common_inputs::{COMMON_INPUTS, COMMON_INPUTS_INV};
use crate::raw::{
    u64_to_usize, CompiledAddr, Output, Transition, EMPTY_ADDRESS,
};

/// The threshold (in number of transitions) at which an index is created for
/// a node's transitions. This speeds up lookup time at the expense of FST
/// size.
const TRANS_INDEX_THRESHOLD: usize = 32;
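
// An illustrative note (not part of the original diff): past this threshold,
// StateAnyTrans::compile emits a 256-byte index so find_input becomes a
// single table lookup instead of a linear scan, trading 256 extra bytes per
// such node for faster traversal.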

/// Node represents a single state in a finite state transducer.
///
/// Nodes are very cheap to construct. Notably, they satisfy the `Copy` trait.
#[derive(Clone, Copy)]
pub struct Node<'f> {
    data: &'f [u8],
    state: State,
    start: CompiledAddr,
    end: usize,
    is_final: bool,
    ntrans: usize,
    sizes: PackSizes,
    final_output: Output,
}

impl<'f> fmt::Debug for Node<'f> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        writeln!(f, "NODE@{}", self.start)?;
        writeln!(f, "  end_addr: {}", self.end)?;
        writeln!(f, "  size: {} bytes", self.as_slice().len())?;
        writeln!(f, "  state: {:?}", self.state)?;
        writeln!(f, "  is_final: {}", self.is_final())?;
        writeln!(f, "  final_output: {:?}", self.final_output())?;
        writeln!(f, "  # transitions: {}", self.len())?;
        writeln!(f, "  transitions:")?;
        for t in self.transitions() {
            writeln!(f, "    {:?}", t)?;
        }
        Ok(())
    }
}

impl<'f> Node<'f> {
    /// Creates a new node at the address given.
    ///
    /// `data` should be a slice to an entire FST.
    pub(crate) fn new(addr: CompiledAddr, data: &[u8]) -> Node<'_> {
        let state = State::new(data, addr);
        match state {
            State::EmptyFinal => Node {
                data: &[],
                state: State::EmptyFinal,
                start: EMPTY_ADDRESS,
                end: EMPTY_ADDRESS,
                is_final: true,
                ntrans: 0,
                sizes: PackSizes::new(),
                final_output: Output::zero(),
            },
            State::OneTransNext(s) => {
                let data = &data[..addr + 1];
                Node {
                    data,
                    state,
                    start: addr,
                    end: s.end_addr(data),
                    is_final: false,
                    sizes: PackSizes::new(),
                    ntrans: 1,
                    final_output: Output::zero(),
                }
            }
            State::OneTrans(s) => {
                let data = &data[..addr + 1];
                let sizes = s.sizes(data);
                Node {
                    data,
                    state,
                    start: addr,
                    end: s.end_addr(data, sizes),
                    is_final: false,
                    ntrans: 1,
                    sizes,
                    final_output: Output::zero(),
                }
            }
            State::AnyTrans(s) => {
                let data = &data[..addr + 1];
                let sizes = s.sizes(data);
                let ntrans = s.ntrans(data);
                Node {
                    data,
                    state,
                    start: addr,
                    end: s.end_addr(data, sizes, ntrans),
                    is_final: s.is_final_state(),
                    ntrans,
                    sizes,
                    final_output: s.final_output(data, sizes, ntrans),
                }
            }
        }
    }

    /// Returns an iterator over all transitions in this node in lexicographic
    /// order.
    #[inline]
    pub fn transitions<'n>(&'n self) -> Transitions<'f, 'n> {
        Transitions { node: self, range: 0..self.len() }
    }

    /// Returns the transition at index `i`.
    #[inline(always)]
    pub fn transition(&self, i: usize) -> Transition {
        // The `inline(always)` annotation on this function appears to
        // dramatically speed up FST traversal. In particular, measuring the
        // time it takes to run `fst range something-big.fst` shows almost a 2x
        // improvement with this single `inline(always)` annotation.

        match self.state {
            State::OneTransNext(s) => {
                assert!(i == 0);
                Transition {
                    inp: s.input(self),
                    out: Output::zero(),
                    addr: s.trans_addr(self),
                }
            }
            State::OneTrans(s) => {
                assert!(i == 0);
                Transition {
                    inp: s.input(self),
                    out: s.output(self),
                    addr: s.trans_addr(self),
                }
            }
            State::AnyTrans(s) => Transition {
                inp: s.input(self, i),
                out: s.output(self, i),
                addr: s.trans_addr(self, i),
            },
            State::EmptyFinal => panic!("out of bounds"),
        }
    }

    /// Returns the transition address of the `i`th transition.
    #[inline]
    pub fn transition_addr(&self, i: usize) -> CompiledAddr {
        match self.state {
            State::OneTransNext(s) => {
                assert!(i == 0);
                s.trans_addr(self)
            }
            State::OneTrans(s) => {
                assert!(i == 0);
                s.trans_addr(self)
            }
            State::AnyTrans(s) => s.trans_addr(self, i),
            State::EmptyFinal => panic!("out of bounds"),
        }
    }

    /// Finds the `i`th transition corresponding to the given input byte.
    ///
    /// If no transition for this byte exists, then `None` is returned.
    #[inline]
    pub fn find_input(&self, b: u8) -> Option<usize> {
        match self.state {
            State::OneTransNext(s) if s.input(self) == b => Some(0),
            State::OneTransNext(_) => None,
            State::OneTrans(s) if s.input(self) == b => Some(0),
            State::OneTrans(_) => None,
            State::AnyTrans(s) => s.find_input(self, b),
            State::EmptyFinal => None,
        }
    }

    /// If this node is final and has a terminal output value, then it is
    /// returned. Otherwise, a zero output is returned.
    #[inline]
    pub fn final_output(&self) -> Output {
        self.final_output
    }

    /// Returns true if and only if this node corresponds to a final or "match"
    /// state in the finite state transducer.
    #[inline]
    pub fn is_final(&self) -> bool {
        self.is_final
    }

    /// Returns the number of transitions in this node.
    ///
    /// The maximum number of transitions is 256.
    #[inline]
    pub fn len(&self) -> usize {
        self.ntrans
    }

    /// Returns true if and only if this node has zero transitions.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.ntrans == 0
    }

    /// Return the address of this node.
    #[inline]
    pub fn addr(&self) -> CompiledAddr {
        self.start
    }

    #[doc(hidden)]
    #[inline]
    pub fn as_slice(&self) -> &[u8] {
        &self.data[self.end..]
    }

    #[doc(hidden)]
    #[inline]
    pub fn state(&self) -> &'static str {
        match self.state {
            State::OneTransNext(_) => "OTN",
            State::OneTrans(_) => "OT",
            State::AnyTrans(_) => "AT",
            State::EmptyFinal => "EF",
        }
    }

    fn compile<W: io::Write>(
        wtr: W,
        last_addr: CompiledAddr,
        addr: CompiledAddr,
        node: &BuilderNode,
    ) -> io::Result<()> {
        assert!(node.trans.len() <= 256);
        if node.trans.is_empty()
            && node.is_final
            && node.final_output.is_zero()
        {
            Ok(())
        } else if node.trans.len() != 1 || node.is_final {
            StateAnyTrans::compile(wtr, addr, node)
        } else if node.trans[0].addr == last_addr
            && node.trans[0].out.is_zero()
        {
            StateOneTransNext::compile(wtr, addr, node.trans[0].inp)
        } else {
            StateOneTrans::compile(wtr, addr, node.trans[0])
        }
    }
}

impl BuilderNode {
    pub fn compile_to<W: io::Write>(
        &self,
        wtr: W,
        last_addr: CompiledAddr,
        addr: CompiledAddr,
    ) -> io::Result<()> {
        Node::compile(wtr, last_addr, addr, self)
    }
}

#[derive(Clone, Copy, Debug)]
enum State {
    OneTransNext(StateOneTransNext),
    OneTrans(StateOneTrans),
    AnyTrans(StateAnyTrans),
    EmptyFinal,
}

// one trans flag (1), next flag (1), common input
#[derive(Clone, Copy, Debug)]
struct StateOneTransNext(u8);
// one trans flag (1), next flag (0), common input
#[derive(Clone, Copy, Debug)]
struct StateOneTrans(u8);
// one trans flag (0), final flag, # transitions
#[derive(Clone, Copy, Debug)]
struct StateAnyTrans(u8);
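
// A worked example (illustrative, not part of the original diff): the top two
// bits of the state byte select the representation, so 0b11_000001 decodes as
// OneTransNext with common-input index 1 (b't'), while 0b01_000001 decodes as
// AnyTrans with the final flag set and exactly one transition.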

impl State {
    fn new(data: &[u8], addr: CompiledAddr) -> State {
        if addr == EMPTY_ADDRESS {
            return State::EmptyFinal;
        }
        let v = data[addr];
        match (v & 0b11_000000) >> 6 {
            0b11 => State::OneTransNext(StateOneTransNext(v)),
            0b10 => State::OneTrans(StateOneTrans(v)),
            _ => State::AnyTrans(StateAnyTrans(v)),
        }
    }
}

impl StateOneTransNext {
    fn compile<W: io::Write>(
        mut wtr: W,
        _: CompiledAddr,
        input: u8,
    ) -> io::Result<()> {
        let mut state = StateOneTransNext::new();
        state.set_common_input(input);
        if state.common_input().is_none() {
            wtr.write_all(&[input])?;
        }
        wtr.write_all(&[state.0])?;
        Ok(())
    }

    #[inline]
    fn new() -> StateOneTransNext {
        StateOneTransNext(0b11_000000)
    }

    #[inline]
    fn set_common_input(&mut self, input: u8) {
        self.0 = (self.0 & 0b11_000000) | common_idx(input, 0b111111);
    }

    #[inline]
    fn common_input(&self) -> Option<u8> {
        common_input(self.0 & 0b00_111111)
    }

    #[inline]
    fn input_len(&self) -> usize {
        if self.common_input().is_none() {
            1
        } else {
            0
        }
    }

    #[inline]
    fn end_addr(&self, data: &[u8]) -> usize {
        data.len() - 1 - self.input_len()
    }

    #[inline]
    fn input(&self, node: &Node<'_>) -> u8 {
        if let Some(inp) = self.common_input() {
            inp
        } else {
            node.data[node.start - 1]
        }
    }

    #[inline]
    fn trans_addr(&self, node: &Node<'_>) -> CompiledAddr {
        node.end as CompiledAddr - 1
    }
}

impl StateOneTrans {
    fn compile<W: io::Write>(
        mut wtr: W,
        addr: CompiledAddr,
        trans: Transition,
    ) -> io::Result<()> {
        let out = trans.out.value();
        let output_pack_size =
            if out == 0 { 0 } else { bytes::pack_uint(&mut wtr, out)? };
        let trans_pack_size = pack_delta(&mut wtr, addr, trans.addr)?;

        let mut pack_sizes = PackSizes::new();
        pack_sizes.set_output_pack_size(output_pack_size);
        pack_sizes.set_transition_pack_size(trans_pack_size);
        wtr.write_all(&[pack_sizes.encode()])?;

        let mut state = StateOneTrans::new();
        state.set_common_input(trans.inp);
        if state.common_input().is_none() {
            wtr.write_all(&[trans.inp])?;
        }
        wtr.write_all(&[state.0])?;
        Ok(())
    }

    #[inline]
    fn new() -> StateOneTrans {
        StateOneTrans(0b10_000000)
    }

    #[inline]
    fn set_common_input(&mut self, input: u8) {
        self.0 = (self.0 & 0b10_000000) | common_idx(input, 0b111111);
    }

    #[inline]
    fn common_input(&self) -> Option<u8> {
        common_input(self.0 & 0b00_111111)
    }

    #[inline]
    fn input_len(&self) -> usize {
        if self.common_input().is_none() {
            1
        } else {
            0
        }
    }

    #[inline]
    fn sizes(&self, data: &[u8]) -> PackSizes {
        let i = data.len() - 1 - self.input_len() - 1;
        PackSizes::decode(data[i])
    }

    #[inline]
    fn end_addr(&self, data: &[u8], sizes: PackSizes) -> usize {
        data.len() - 1
            - self.input_len()
            - 1 // pack size
            - sizes.transition_pack_size()
            - sizes.output_pack_size()
    }

    #[inline]
    fn input(&self, node: &Node<'_>) -> u8 {
        if let Some(inp) = self.common_input() {
            inp
        } else {
            node.data[node.start - 1]
        }
    }

    #[inline]
    fn output(&self, node: &Node<'_>) -> Output {
        let osize = node.sizes.output_pack_size();
        if osize == 0 {
            return Output::zero();
        }
        let tsize = node.sizes.transition_pack_size();
        let i = node.start
            - self.input_len()
            - 1 // pack size
            - tsize - osize;
        Output::new(bytes::unpack_uint(&node.data[i..], osize as u8))
    }

    #[inline]
    fn trans_addr(&self, node: &Node<'_>) -> CompiledAddr {
        let tsize = node.sizes.transition_pack_size();
        let i = node.start
            - self.input_len()
            - 1 // pack size
            - tsize;
        unpack_delta(&node.data[i..], tsize, node.end)
    }
}

impl StateAnyTrans {
    fn compile<W: io::Write>(
        mut wtr: W,
        addr: CompiledAddr,
        node: &BuilderNode,
    ) -> io::Result<()> {
        assert!(node.trans.len() <= 256);

        let mut tsize = 0;
        let mut osize = bytes::pack_size(node.final_output.value());
        let mut any_outs = !node.final_output.is_zero();
        for t in &node.trans {
            tsize = cmp::max(tsize, pack_delta_size(addr, t.addr));
            osize = cmp::max(osize, bytes::pack_size(t.out.value()));
            any_outs = any_outs || !t.out.is_zero();
        }

        let mut pack_sizes = PackSizes::new();
        if any_outs {
            pack_sizes.set_output_pack_size(osize);
        } else {
            pack_sizes.set_output_pack_size(0);
        }
        pack_sizes.set_transition_pack_size(tsize);

        let mut state = StateAnyTrans::new();
        state.set_final_state(node.is_final);
        state.set_state_ntrans(node.trans.len() as u8);

        if any_outs {
            if node.is_final {
                bytes::pack_uint_in(
                    &mut wtr,
                    node.final_output.value(),
                    osize,
                )?;
            }
            for t in node.trans.iter().rev() {
                bytes::pack_uint_in(&mut wtr, t.out.value(), osize)?;
            }
        }
        for t in node.trans.iter().rev() {
            pack_delta_in(&mut wtr, addr, t.addr, tsize)?;
        }
        for t in node.trans.iter().rev() {
            wtr.write_all(&[t.inp])?;
        }
        if node.trans.len() > TRANS_INDEX_THRESHOLD {
            // A value of 255 indicates that no transition exists for the byte
            // at that index. (Except when there are 256 transitions.) Namely,
            // any value greater than or equal to the number of transitions in
            // this node indicates an absent transition.
            let mut index = [255u8; 256];
            for (i, t) in node.trans.iter().enumerate() {
                index[t.inp as usize] = i as u8;
            }
            wtr.write_all(&index)?;
        }

        wtr.write_all(&[pack_sizes.encode()])?;
        if state.state_ntrans().is_none() {
            if node.trans.len() == 256 {
                // 256 can't be represented in a u8, so we abuse the fact that
                // the # of transitions can never be 1 here, since 1 is always
                // encoded in the state byte.
                wtr.write_all(&[1])?;
            } else {
                wtr.write_all(&[node.trans.len() as u8])?;
            }
        }
        wtr.write_all(&[state.0])?;
        Ok(())
    }

    #[inline]
    fn new() -> StateAnyTrans {
        StateAnyTrans(0b00_000000)
    }

    #[inline]
    fn set_final_state(&mut self, yes: bool) {
        if yes {
            self.0 |= 0b01_000000;
        }
    }

    #[inline]
    fn is_final_state(&self) -> bool {
        self.0 & 0b01_000000 == 0b01_000000
    }

    #[inline]
    fn set_state_ntrans(&mut self, n: u8) {
        if n <= 0b00_111111 {
            self.0 = (self.0 & 0b11_000000) | n;
        }
    }

    #[inline]
    fn state_ntrans(&self) -> Option<u8> {
        let n = self.0 & 0b00_111111;
        if n == 0 {
            None
        } else {
            Some(n)
        }
    }

    #[inline]
    fn sizes(&self, data: &[u8]) -> PackSizes {
        let i = data.len() - 1 - self.ntrans_len() - 1;
        PackSizes::decode(data[i])
    }

    #[inline]
    fn total_trans_size(&self, sizes: PackSizes, ntrans: usize) -> usize {
        let index_size = self.trans_index_size(ntrans);
        ntrans + (ntrans * sizes.transition_pack_size()) + index_size
    }

    #[inline]
    fn trans_index_size(&self, ntrans: usize) -> usize {
        if ntrans > TRANS_INDEX_THRESHOLD {
            256
        } else {
            0
        }
    }

    #[inline]
    fn ntrans_len(&self) -> usize {
        if self.state_ntrans().is_none() {
            1
        } else {
            0
        }
    }

    #[inline]
    fn ntrans(&self, data: &[u8]) -> usize {
        if let Some(n) = self.state_ntrans() {
            n as usize
        } else {
            let n = data[data.len() - 2] as usize;
            if n == 1 {
                // "1" is never a normal legal value here, because if there
                // is only 1 transition, then it is encoded in the state byte.
                256
            } else {
                n
            }
        }
    }

    #[inline]
    fn final_output(
        &self,
        data: &[u8],
        sizes: PackSizes,
        ntrans: usize,
    ) -> Output {
        let osize = sizes.output_pack_size();
        if osize == 0 || !self.is_final_state() {
            return Output::zero();
        }
        let at = data.len() - 1
            - self.ntrans_len()
            - 1 // pack size
            - self.total_trans_size(sizes, ntrans)
            - (ntrans * osize) // output values
            - osize; // the desired output value
        Output::new(bytes::unpack_uint(&data[at..], osize as u8))
    }

    #[inline]
    fn end_addr(&self, data: &[u8], sizes: PackSizes, ntrans: usize) -> usize {
        let osize = sizes.output_pack_size();
        let final_osize = if !self.is_final_state() { 0 } else { osize };
        data.len() - 1
            - self.ntrans_len()
            - 1 // pack size
            - self.total_trans_size(sizes, ntrans)
            - (ntrans * osize) // output values
            - final_osize // final output
    }

    #[inline]
    fn trans_addr(&self, node: &Node<'_>, i: usize) -> CompiledAddr {
        assert!(i < node.ntrans);
        let tsize = node.sizes.transition_pack_size();
        let at = node.start
            - self.ntrans_len()
            - 1 // pack size
            - self.trans_index_size(node.ntrans)
            - node.ntrans // inputs
            - (i * tsize) // the previous transition addresses
            - tsize; // the desired transition address
        unpack_delta(&node.data[at..], tsize, node.end)
    }

    #[inline]
    fn input(&self, node: &Node<'_>, i: usize) -> u8 {
        let at = node.start
            - self.ntrans_len()
            - 1 // pack size
            - self.trans_index_size(node.ntrans)
            - i
            - 1; // the input byte
        node.data[at]
    }

    #[inline]
    fn find_input(&self, node: &Node<'_>, b: u8) -> Option<usize> {
        if node.ntrans > TRANS_INDEX_THRESHOLD {
            let start = node.start
                - self.ntrans_len()
                - 1 // pack size
                - self.trans_index_size(node.ntrans);
            let i = node.data[start + b as usize] as usize;
            if i >= node.ntrans {
                None
            } else {
                Some(i)
            }
        } else {
            let start = node.start
                - self.ntrans_len()
                - 1 // pack size
                - node.ntrans; // inputs
            let end = start + node.ntrans;
            let inputs = &node.data[start..end];
            inputs.iter().position(|&b2| b == b2).map(|i| node.ntrans - i - 1)
        }
    }

    #[inline]
    fn output(&self, node: &Node<'_>, i: usize) -> Output {
        let osize = node.sizes.output_pack_size();
        if osize == 0 {
            return Output::zero();
        }
        let at = node.start
            - self.ntrans_len()
            - 1 // pack size
            - self.total_trans_size(node.sizes, node.ntrans)
            - (i * osize) // the previous outputs
            - osize; // the desired output value
        Output::new(bytes::unpack_uint(&node.data[at..], osize as u8))
    }
}

// high 4 bits is transition address packed size.
// low 4 bits is output value packed size.
//
// `0` is a legal value which means there are no transitions/outputs.
#[derive(Clone, Copy, Debug)]
struct PackSizes(u8);

impl PackSizes {
    #[inline]
    fn new() -> PackSizes {
        PackSizes(0)
    }

    #[inline]
    fn decode(v: u8) -> PackSizes {
        PackSizes(v)
    }

    #[inline]
    fn encode(&self) -> u8 {
        self.0
    }

    #[inline]
    fn set_transition_pack_size(&mut self, size: u8) {
        assert!(size <= 8);
        self.0 = (self.0 & 0b0000_1111) | (size << 4);
    }

    #[inline]
    fn transition_pack_size(&self) -> usize {
        ((self.0 & 0b1111_0000) >> 4) as usize
    }

    #[inline]
    fn set_output_pack_size(&mut self, size: u8) {
        assert!(size <= 8);
        self.0 = (self.0 & 0b1111_0000) | size;
    }

    #[inline]
    fn output_pack_size(&self) -> usize {
        (self.0 & 0b0000_1111) as usize
    }
}

/// An iterator over all transitions in a node.
///
/// `'f` is the lifetime of the underlying fst and `'n` is the lifetime of
/// the underlying `Node`.
pub struct Transitions<'f, 'n> {
    node: &'n Node<'f>,
    range: Range<usize>,
}

impl<'f, 'n> Iterator for Transitions<'f, 'n> {
    type Item = Transition;

    #[inline]
    fn next(&mut self) -> Option<Transition> {
        self.range.next().map(|i| self.node.transition(i))
    }
}

/// common_idx translates a byte to an index in the COMMON_INPUTS_INV array.
///
/// I wonder if it would be prudent to store this mapping in the FST itself.
/// The advantage of doing so would mean that common inputs would reflect the
/// specific data in the FST. The problem of course is that this table has to
/// be computed up front, which is pretty much at odds with the streaming
/// nature of the builder.
///
/// Nevertheless, the *caller* may have a priori knowledge that could be
/// supplied to the builder manually, which could then be embedded in the FST.
#[inline]
fn common_idx(input: u8, max: u8) -> u8 {
    let val = ((COMMON_INPUTS[input as usize] as u32 + 1) % 256) as u8;
    if val > max {
        0
    } else {
        val
    }
}
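
// A worked example (illustrative, not part of the original diff): b't' is the
// most common input, so COMMON_INPUTS[b't' as usize] is 0 and
// common_idx(b't', 0b111111) yields 1; common_input(1) maps back to b't'.
// Index 0 is reserved to mean "no common input", hence the +1 shift.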

/// common_input translates a common input index stored in a serialized FST
/// to the corresponding byte.
#[inline]
fn common_input(idx: u8) -> Option<u8> {
    if idx == 0 {
        None
    } else {
        Some(COMMON_INPUTS_INV[(idx - 1) as usize])
    }
}

#[inline]
fn pack_delta<W: io::Write>(
    wtr: W,
    node_addr: CompiledAddr,
    trans_addr: CompiledAddr,
) -> io::Result<u8> {
    let nbytes = pack_delta_size(node_addr, trans_addr);
    pack_delta_in(wtr, node_addr, trans_addr, nbytes)?;
    Ok(nbytes)
}

#[inline]
fn pack_delta_in<W: io::Write>(
    wtr: W,
    node_addr: CompiledAddr,
    trans_addr: CompiledAddr,
    nbytes: u8,
) -> io::Result<()> {
    let delta_addr = if trans_addr == EMPTY_ADDRESS {
        EMPTY_ADDRESS
    } else {
        node_addr - trans_addr
    };
    bytes::pack_uint_in(wtr, delta_addr as u64, nbytes)
}

#[inline]
fn pack_delta_size(node_addr: CompiledAddr, trans_addr: CompiledAddr) -> u8 {
    let delta_addr = if trans_addr == EMPTY_ADDRESS {
        EMPTY_ADDRESS
    } else {
        node_addr - trans_addr
    };
    bytes::pack_size(delta_addr as u64)
}

#[inline]
fn unpack_delta(
    slice: &[u8],
    trans_pack_size: usize,
    node_addr: usize,
) -> CompiledAddr {
    let delta = bytes::unpack_uint(slice, trans_pack_size as u8);
    let delta_addr = u64_to_usize(delta);
    if delta_addr == EMPTY_ADDRESS {
        EMPTY_ADDRESS
    } else {
        node_addr - delta_addr
    }
}

#[cfg(test)]
mod tests {
    use quickcheck::{quickcheck, TestResult};

    use crate::raw::build::BuilderNode;
    use crate::raw::node::Node;
    use crate::raw::{Builder, CompiledAddr, Output, Transition};
    use crate::stream::Streamer;

    const NEVER_LAST: CompiledAddr = std::u64::MAX as CompiledAddr;

    #[test]
    fn prop_emits_inputs() {
        fn p(mut bs: Vec<Vec<u8>>) -> TestResult {
            bs.sort();
            bs.dedup();

            let mut bfst = Builder::memory();
            for word in &bs {
                bfst.add(word).unwrap();
            }
            let fst = bfst.into_fst();
            let mut rdr = fst.stream();
            let mut words = vec![];
            while let Some(w) = rdr.next() {
                words.push(w.0.to_owned());
            }
            TestResult::from_bool(bs == words)
        }
        quickcheck(p as fn(Vec<Vec<u8>>) -> TestResult)
    }

    fn nodes_equal(compiled: &Node, uncompiled: &BuilderNode) -> bool {
        println!("{:?}", compiled);
        assert_eq!(compiled.is_final(), uncompiled.is_final);
        assert_eq!(compiled.len(), uncompiled.trans.len());
        assert_eq!(compiled.final_output(), uncompiled.final_output);
        for (ct, ut) in
            compiled.transitions().zip(uncompiled.trans.iter().cloned())
        {
            assert_eq!(ct.inp, ut.inp);
            assert_eq!(ct.out, ut.out);
            assert_eq!(ct.addr, ut.addr);
        }
        true
    }

    fn compile(node: &BuilderNode) -> (CompiledAddr, Vec<u8>) {
        let mut buf = vec![0; 24];
        node.compile_to(&mut buf, NEVER_LAST, 24).unwrap();
        (buf.len() as CompiledAddr - 1, buf)
    }

    fn roundtrip(bnode: &BuilderNode) -> bool {
        let (addr, bytes) = compile(bnode);
        let node = Node::new(addr, &bytes);
        nodes_equal(&node, bnode)
    }

    fn trans(addr: CompiledAddr, inp: u8) -> Transition {
        Transition { inp, out: Output::zero(), addr }
    }
|
||||
|
||||
#[test]
|
||||
fn bin_no_trans() {
|
||||
let bnode = BuilderNode {
|
||||
is_final: false,
|
||||
final_output: Output::zero(),
|
||||
trans: vec![],
|
||||
};
|
||||
let (addr, buf) = compile(&bnode);
|
||||
let node = Node::new(addr, &buf);
|
||||
assert_eq!(node.as_slice().len(), 3);
|
||||
roundtrip(&bnode);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn bin_one_trans_common() {
|
||||
let bnode = BuilderNode {
|
||||
is_final: false,
|
||||
final_output: Output::zero(),
|
||||
trans: vec![trans(20, b'a')],
|
||||
};
|
||||
let (addr, buf) = compile(&bnode);
|
||||
let node = Node::new(addr, &buf);
|
||||
assert_eq!(node.as_slice().len(), 3);
|
||||
roundtrip(&bnode);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn bin_one_trans_not_common() {
|
||||
let bnode = BuilderNode {
|
||||
is_final: false,
|
||||
final_output: Output::zero(),
|
||||
trans: vec![trans(2, b'\xff')],
|
||||
};
|
||||
let (addr, buf) = compile(&bnode);
|
||||
let node = Node::new(addr, &buf);
|
||||
assert_eq!(node.as_slice().len(), 4);
|
||||
roundtrip(&bnode);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn bin_many_trans() {
|
||||
let bnode = BuilderNode {
|
||||
is_final: false,
|
||||
final_output: Output::zero(),
|
||||
trans: vec![
|
||||
trans(2, b'a'),
|
||||
trans(3, b'b'),
|
||||
trans(4, b'c'),
|
||||
trans(5, b'd'),
|
||||
trans(6, b'e'),
|
||||
trans(7, b'f'),
|
||||
],
|
||||
};
|
||||
let (addr, buf) = compile(&bnode);
|
||||
let node = Node::new(addr, &buf);
|
||||
assert_eq!(node.as_slice().len(), 14);
|
||||
roundtrip(&bnode);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn node_max_trans() {
|
||||
let bnode = BuilderNode {
|
||||
is_final: false,
|
||||
final_output: Output::zero(),
|
||||
trans: (0..256).map(|i| trans(0, i as u8)).collect(),
|
||||
};
|
||||
let (addr, buf) = compile(&bnode);
|
||||
let node = Node::new(addr, &buf);
|
||||
assert_eq!(node.transitions().count(), 256);
|
||||
assert_eq!(node.len(), node.transitions().count());
|
||||
roundtrip(&bnode);
|
||||
}
|
||||
}
264
crates/ufst/src/raw/registry.rs
Normal file
@@ -0,0 +1,264 @@
use crate::raw::build::BuilderNode;
use crate::raw::{CompiledAddr, NONE_ADDRESS};

#[derive(Debug)]
pub struct Registry {
    table: Vec<RegistryCell>,
    table_size: usize, // number of rows
    mru_size: usize,   // number of columns
}

#[derive(Debug)]
struct RegistryCache<'a> {
    cells: &'a mut [RegistryCell],
}

#[derive(Clone, Debug)]
pub struct RegistryCell {
    addr: CompiledAddr,
    node: BuilderNode,
}

#[derive(Debug)]
pub enum RegistryEntry<'a> {
    Found(CompiledAddr),
    NotFound(&'a mut RegistryCell),
    Rejected,
}

impl Registry {
    pub fn new(table_size: usize, mru_size: usize) -> Registry {
        let empty_cell = RegistryCell::none();
        let ncells = table_size.checked_mul(mru_size).unwrap();
        Registry { table: vec![empty_cell; ncells], table_size, mru_size }
    }

    pub fn entry<'a>(&'a mut self, node: &BuilderNode) -> RegistryEntry<'a> {
        if self.table.is_empty() {
            return RegistryEntry::Rejected;
        }
        let bucket = self.hash(node);
        let start = self.mru_size * bucket;
        let end = start + self.mru_size;
        RegistryCache { cells: &mut self.table[start..end] }.entry(node)
    }

    fn hash(&self, node: &BuilderNode) -> usize {
        // Basic FNV-1a hash as described:
        // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
        //
        // In unscientific experiments, this provides the same compression
        // as `std::hash::SipHasher` but is much much faster.
        const FNV_PRIME: u64 = 1099511628211;
        let mut h = 14695981039346656037;
        h = (h ^ (node.is_final as u64)).wrapping_mul(FNV_PRIME);
        h = (h ^ node.final_output.value()).wrapping_mul(FNV_PRIME);
        for t in &node.trans {
            h = (h ^ (t.inp as u64)).wrapping_mul(FNV_PRIME);
            h = (h ^ t.out.value()).wrapping_mul(FNV_PRIME);
            h = (h ^ (t.addr as u64)).wrapping_mul(FNV_PRIME);
        }
        (h as usize) % self.table_size
    }
}

impl<'a> RegistryCache<'a> {
    fn entry(mut self, node: &BuilderNode) -> RegistryEntry<'a> {
        if self.cells.len() == 1 {
            let cell = &mut self.cells[0];
            if !cell.is_none() && &cell.node == node {
                RegistryEntry::Found(cell.addr)
            } else {
                cell.node.clone_from(node);
                RegistryEntry::NotFound(cell)
            }
        } else if self.cells.len() == 2 {
            let cell1 = &mut self.cells[0];
            if !cell1.is_none() && &cell1.node == node {
                return RegistryEntry::Found(cell1.addr);
            }

            let cell2 = &mut self.cells[1];
            if !cell2.is_none() && &cell2.node == node {
                let addr = cell2.addr;
                self.cells.swap(0, 1);
                return RegistryEntry::Found(addr);
            }

            self.cells[1].node.clone_from(node);
            self.cells.swap(0, 1);
            RegistryEntry::NotFound(&mut self.cells[0])
        } else {
            let find = |c: &RegistryCell| !c.is_none() && &c.node == node;
            if let Some(i) = self.cells.iter().position(find) {
                let addr = self.cells[i].addr;
                self.promote(i); // most recently used
                RegistryEntry::Found(addr)
            } else {
                let last = self.cells.len() - 1;
                self.cells[last].node.clone_from(node); // discard LRU
                self.promote(last);
                RegistryEntry::NotFound(&mut self.cells[0])
            }
        }
    }

    fn promote(&mut self, mut i: usize) {
        assert!(i < self.cells.len());
        while i > 0 {
            self.cells.swap(i - 1, i);
            i -= 1;
        }
    }
}

impl RegistryCell {
    fn none() -> RegistryCell {
        RegistryCell { addr: NONE_ADDRESS, node: BuilderNode::default() }
    }

    fn is_none(&self) -> bool {
        self.addr == NONE_ADDRESS
    }

    pub fn insert(&mut self, addr: CompiledAddr) {
        self.addr = addr;
    }
}

#[cfg(test)]
mod tests {
    use super::{Registry, RegistryCache, RegistryCell, RegistryEntry};
    use crate::raw::build::BuilderNode;
    use crate::raw::{Output, Transition};

    fn assert_rejected(entry: RegistryEntry) {
        match entry {
            RegistryEntry::Rejected => {}
            entry => panic!("expected rejected entry, got: {:?}", entry),
        }
    }

    fn assert_not_found(entry: RegistryEntry) {
        match entry {
            RegistryEntry::NotFound(_) => {}
            entry => panic!("expected not found entry, got: {:?}", entry),
        }
    }

    fn assert_insert_and_found(reg: &mut Registry, bnode: &BuilderNode) {
        match reg.entry(bnode) {
            RegistryEntry::NotFound(cell) => cell.insert(1234),
            entry => panic!("expected not found entry, got: {:?}", entry),
        }
        match reg.entry(bnode) {
            RegistryEntry::Found(addr) => assert_eq!(addr, 1234),
            entry => panic!("expected found entry, got: {:?}", entry),
        }
    }

    #[test]
    fn empty_is_ok() {
        let mut reg = Registry::new(0, 0);
        let bnode = BuilderNode {
            is_final: false,
            final_output: Output::zero(),
            trans: vec![],
        };
        assert_rejected(reg.entry(&bnode));
    }

    #[test]
    fn one_final_is_ok() {
        let mut reg = Registry::new(1, 1);
        let bnode = BuilderNode {
            is_final: true,
            final_output: Output::zero(),
            trans: vec![],
        };
        assert_insert_and_found(&mut reg, &bnode);
    }

    #[test]
    fn one_with_trans_is_ok() {
        let mut reg = Registry::new(1, 1);
        let bnode = BuilderNode {
            is_final: false,
            final_output: Output::zero(),
            trans: vec![Transition {
                addr: 0,
                inp: b'a',
                out: Output::zero(),
            }],
        };
        assert_insert_and_found(&mut reg, &bnode);
        assert_not_found(
            reg.entry(&BuilderNode { is_final: true, ..bnode.clone() }),
        );
        assert_not_found(reg.entry(&BuilderNode {
            trans: vec![Transition {
                addr: 0,
                inp: b'b',
                out: Output::zero(),
            }],
            ..bnode.clone()
        }));
        assert_not_found(reg.entry(&BuilderNode {
            trans: vec![Transition {
                addr: 0,
                inp: b'a',
                out: Output::new(1),
            }],
            ..bnode.clone()
        }));
    }

    #[test]
    fn cache_works() {
        let mut reg = Registry::new(1, 1);

        let bnode1 = BuilderNode { is_final: true, ..BuilderNode::default() };
        assert_insert_and_found(&mut reg, &bnode1);

        let bnode2 =
            BuilderNode { final_output: Output::new(1), ..bnode1.clone() };
        assert_insert_and_found(&mut reg, &bnode2);
        assert_not_found(reg.entry(&bnode1));
    }

    #[test]
    fn promote() {
        let bn = BuilderNode::default();
        let mut bnodes = vec![
            RegistryCell { addr: 1, node: bn.clone() },
            RegistryCell { addr: 2, node: bn.clone() },
            RegistryCell { addr: 3, node: bn.clone() },
            RegistryCell { addr: 4, node: bn },
        ];
        let mut cache = RegistryCache { cells: &mut bnodes };

        cache.promote(0);
        assert_eq!(cache.cells[0].addr, 1);
        assert_eq!(cache.cells[1].addr, 2);
        assert_eq!(cache.cells[2].addr, 3);
        assert_eq!(cache.cells[3].addr, 4);

        cache.promote(1);
        assert_eq!(cache.cells[0].addr, 2);
        assert_eq!(cache.cells[1].addr, 1);
        assert_eq!(cache.cells[2].addr, 3);
        assert_eq!(cache.cells[3].addr, 4);

        cache.promote(3);
        assert_eq!(cache.cells[0].addr, 4);
        assert_eq!(cache.cells[1].addr, 2);
        assert_eq!(cache.cells[2].addr, 1);
        assert_eq!(cache.cells[3].addr, 3);

        cache.promote(2);
        assert_eq!(cache.cells[0].addr, 1);
        assert_eq!(cache.cells[1].addr, 4);
        assert_eq!(cache.cells[2].addr, 2);
        assert_eq!(cache.cells[3].addr, 3);
    }
}
53
crates/ufst/src/raw/registry_minimal.rs
Normal file
@@ -0,0 +1,53 @@
// This module is a drop-in but inefficient replacement of the LRU registry.
// In particular, this registry will never forget a node. In other words, if
// this registry is used during construction, then you're guaranteed a minimal
// FST.
//
// This is really only meant to be used for debugging and experiments. It is
// a memory/CPU hog.
//
// One "easy" improvement here is to use an FNV hash instead of the super
// expensive SipHasher.

#![allow(dead_code)]

use std::collections::hash_map::{Entry, HashMap};

use crate::raw::build::BuilderNode;
use crate::raw::CompiledAddr;

#[derive(Debug)]
pub struct Registry {
    table: HashMap<BuilderNode, RegistryCell>,
}

#[derive(Debug)]
pub enum RegistryEntry<'a> {
    Found(CompiledAddr),
    NotFound(&'a mut RegistryCell),
    Rejected,
}

#[derive(Clone, Copy, Debug)]
pub struct RegistryCell(CompiledAddr);

impl Registry {
    pub fn new(table_size: usize, _lru_size: usize) -> Registry {
        Registry { table: HashMap::with_capacity(table_size) }
    }

    pub fn entry<'a>(&'a mut self, bnode: &BuilderNode) -> RegistryEntry<'a> {
        match self.table.entry(bnode.clone()) {
            Entry::Occupied(v) => RegistryEntry::Found(v.get().0),
            Entry::Vacant(v) => {
                RegistryEntry::NotFound(v.insert(RegistryCell(0)))
            }
        }
    }
}

impl RegistryCell {
    pub fn insert(&mut self, addr: CompiledAddr) {
        self.0 = addr;
    }
}
532
crates/ufst/src/raw/tests.rs
Normal file
@@ -0,0 +1,532 @@
use crate::automaton::AlwaysMatch;
use crate::error::Error;
use crate::raw::{self, Bound, Builder, Fst, Output, Stream};
use crate::stream::Streamer;

const TEXT: &str = include_str!("./../../data/words-100000");

pub fn fst_set<I, S>(ss: I) -> Fst<Vec<u8>>
where
    I: IntoIterator<Item = S>,
    S: AsRef<[u8]>,
{
    let mut bfst = Builder::memory();
    let mut ss: Vec<Vec<u8>> =
        ss.into_iter().map(|s| s.as_ref().to_vec()).collect();
    ss.sort();
    ss.dedup();
    for s in ss.iter() {
        bfst.add(s).unwrap();
    }
    let fst = bfst.into_fst();
    assert_eq!(fst.len(), ss.len());
    fst
}

pub fn fst_map<I, S>(ss: I) -> Fst<Vec<u8>>
where
    I: IntoIterator<Item = (S, u64)>,
    S: AsRef<[u8]>,
{
    let mut bfst = Builder::memory();
    let mut ss: Vec<(Vec<u8>, u64)> =
        ss.into_iter().map(|(s, o)| (s.as_ref().to_vec(), o)).collect();
    ss.sort();
    ss.dedup();
    for (s, o) in ss.into_iter() {
        bfst.insert(s, o).unwrap();
    }
    bfst.into_fst()
}

pub fn fst_inputs<D: AsRef<[u8]>>(fst: &Fst<D>) -> Vec<Vec<u8>> {
    let mut words = vec![];
    let mut rdr = fst.stream();
    while let Some((word, _)) = rdr.next() {
        words.push(word.to_vec());
    }
    words
}

pub fn fst_inputs_outputs<D: AsRef<[u8]>>(
    fst: &Fst<D>,
) -> Vec<(Vec<u8>, u64)> {
    let mut words = vec![];
    let mut rdr = fst.stream();
    while let Some((word, out)) = rdr.next() {
        words.push((word.to_vec(), out.value()));
    }
    words
}

macro_rules! test_set {
    ($name:ident, $($s:expr),+) => {
        #[test]
        fn $name() {
            let mut items = vec![$($s),*];
            let fst = fst_set(&items);
            let mut rdr = fst.stream();
            items.sort_unstable();
            items.dedup();
            for item in &items {
                assert_eq!(rdr.next().unwrap().0, item.as_bytes());
            }
            assert_eq!(rdr.next(), None);
            for item in &items {
                assert!(fst.get(item).is_some());
            }
        }
    }
}

macro_rules! test_set_fail {
    ($name:ident, $($s:expr),+) => {
        #[test]
        #[should_panic]
        fn $name() {
            let mut bfst = Builder::memory();
            $(bfst.add($s).unwrap();)*
        }
    }
}

test_set!(fst_set_only_empty, "");
test_set!(fst_set_one, "a");
test_set!(fst_set_dupe_empty, "", "");
test_set!(fst_set_dupe1, "a", "a");
test_set!(fst_set_dupe2, "a", "b", "b");
test_set!(fst_set_two1, "a", "b");
test_set!(fst_set_two2, "a", "ab");
test_set!(fst_set_jan, "jam", "jbm", "jcm", "jdm", "jem", "jfm", "jgm");

test_set_fail!(fst_set_order1, "b", "a");
test_set_fail!(fst_set_order2, "a", "b", "c", "a");

#[test]
fn fst_set_100000() {
    let words: Vec<Vec<u8>> =
        TEXT.lines().map(|s| s.as_bytes().to_vec()).collect();
    let fst = fst_set(words.clone());
    assert_eq!(words, fst_inputs(&fst));
    for word in &words {
        assert!(
            fst.get(word).is_some(),
            "failed to find word: {}",
            std::str::from_utf8(word).unwrap()
        );
    }
}

macro_rules! test_map {
    ($name:ident, $($s:expr, $o:expr),+) => {
        #[test]
        fn $name() {
            let fst = fst_map(vec![$(($s, $o)),*]);
            let mut rdr = fst.stream();
            $({
                let (s, o) = rdr.next().unwrap();
                assert_eq!((s, o.value()), ($s.as_bytes(), $o));
            })*
            assert_eq!(rdr.next(), None);
            $({
                assert_eq!(fst.get($s.as_bytes()), Some(Output::new($o)));
            })*
        }
    }
}

macro_rules! test_map_fail {
    ($name:ident, $($s:expr, $o:expr),+) => {
        #[test]
        #[should_panic]
        fn $name() {
            let mut bfst = Builder::memory();
            $(bfst.insert($s, $o).unwrap();)*
        }
    }
}

test_map!(fst_map_only_empty1, "", 0);
test_map!(fst_map_only_empty2, "", 100);
test_map!(fst_map_only_empty3, "", 9999999999);
test_map!(fst_map_one1, "a", 0);
test_map!(fst_map_one2, "a", 100);
test_map!(fst_map_one3, "a", 999999999);
test_map!(fst_map_two, "a", 1, "b", 2);
test_map!(fst_map_many1, "a", 34786, "ab", 26);
test_map!(
    fst_map_many2,
    "a", 34786,
    "ab", 26,
    "abc", 58976,
    "abcd", 25,
    "z", 58,
    "zabc", 6798
);
test_map!(fst_map_many3, "a", 1, "ab", 0, "abc", 0);

test_map_fail!(fst_map_dupe_empty, "", 0, "", 0);
test_map_fail!(fst_map_dupe1, "a", 0, "a", 0);
test_map_fail!(fst_map_dupe2, "a", 0, "b", 0, "b", 0);
test_map_fail!(fst_map_order1, "b", 0, "a", 0);
test_map_fail!(fst_map_order2, "a", 0, "b", 0, "c", 0, "a", 0);

#[test]
fn fst_map_100000_increments() {
    let words: Vec<(Vec<u8>, u64)> = TEXT
        .lines()
        .enumerate()
        .map(|(i, s)| (s.as_bytes().to_vec(), i as u64))
        .collect();
    let fst = fst_map(words.clone());
    assert_eq!(words, fst_inputs_outputs(&fst));
    for &(ref word, out) in &words {
        assert_eq!(fst.get(word), Some(Output::new(out)));
    }
}

#[test]
fn fst_map_100000_lengths() {
    let words: Vec<(Vec<u8>, u64)> = TEXT
        .lines()
        .map(|s| (s.as_bytes().to_vec(), s.len() as u64))
        .collect();
    let fst = fst_map(words.clone());
    assert_eq!(words, fst_inputs_outputs(&fst));
    for &(ref word, out) in &words {
        assert_eq!(fst.get(word), Some(Output::new(out)));
    }
}

#[test]
fn invalid_format() {
    match Fst::new(vec![0; 0]) {
        Err(Error::Fst(raw::Error::Format { .. })) => {}
        Err(err) => panic!("expected format error, got {:?}", err),
        Ok(_) => panic!("expected format error, got FST"),
    }
}

#[test]
fn fst_set_zero() {
    let fst = fst_set::<_, String>(vec![]);
    let mut rdr = fst.stream();
    assert_eq!(rdr.next(), None);
}

macro_rules! test_range {
    (
        $name:ident,
        min: $min:expr,
        max: $max:expr,
        imin: $imin:expr,
        imax: $imax:expr,
        $($s:expr),*
    ) => {
        #[test]
        fn $name() {
            let items: Vec<&'static str> = vec![$($s),*];
            let items: Vec<_> = items
                .into_iter()
                .enumerate()
                .map(|(i, k)| (k, i as u64))
                .collect();
            let fst = fst_map(items.clone());
            let mut rdr = Stream::new(fst.as_ref(), AlwaysMatch, $min, $max);
            #[allow(clippy::reversed_empty_ranges)]
            for i in $imin..$imax {
                assert_eq!(
                    rdr.next().unwrap(),
                    (items[i].0.as_bytes(), Output::new(items[i].1)),
                );
            }
            assert_eq!(rdr.next(), None);
        }
    }
}

test_range! {
    fst_range_empty_1,
    min: Bound::Unbounded, max: Bound::Unbounded,
    imin: 0, imax: 0,
}

test_range! {
    fst_range_empty_2,
    min: Bound::Unbounded, max: Bound::Unbounded,
    imin: 0, imax: 1,
    ""
}

test_range! {
    fst_range_empty_3,
    min: Bound::Included(vec![]), max: Bound::Unbounded,
    imin: 0, imax: 1,
    ""
}

test_range! {
    fst_range_empty_4,
    min: Bound::Excluded(vec![]), max: Bound::Unbounded,
    imin: 0, imax: 0,
    ""
}

test_range! {
    fst_range_empty_5,
    min: Bound::Included(vec![]), max: Bound::Unbounded,
    imin: 0, imax: 2,
    "", "a"
}

test_range! {
    fst_range_empty_6,
    min: Bound::Excluded(vec![]), max: Bound::Unbounded,
    imin: 1, imax: 2,
    "", "a"
}

test_range! {
    fst_range_empty_7,
    min: Bound::Unbounded, max: Bound::Unbounded,
    imin: 0, imax: 2,
    "", "a"
}

test_range! {
    fst_range_empty_8,
    min: Bound::Unbounded, max: Bound::Included(vec![]),
    imin: 0, imax: 1,
    ""
}

test_range! {
    fst_range_empty_9,
    min: Bound::Unbounded, max: Bound::Excluded(vec![]),
    imin: 0, imax: 0,
    ""
}

test_range! {
    fst_range_empty_10,
    min: Bound::Unbounded, max: Bound::Included(vec![]),
    imin: 0, imax: 1,
    "", "a"
}

test_range! {
    fst_range_empty_11,
    min: Bound::Included(vec![]), max: Bound::Included(vec![]),
    imin: 0, imax: 1,
    ""
}

test_range! {
    fst_range_1,
    min: Bound::Included(vec![b'a']), max: Bound::Included(vec![b'z']),
    imin: 0, imax: 4,
    "a", "b", "y", "z"
}

test_range! {
    fst_range_2,
    min: Bound::Excluded(vec![b'a']), max: Bound::Included(vec![b'y']),
    imin: 1, imax: 3,
    "a", "b", "y", "z"
}

test_range! {
    fst_range_3,
    min: Bound::Excluded(vec![b'a']), max: Bound::Excluded(vec![b'y']),
    imin: 1, imax: 2,
    "a", "b", "y", "z"
}

test_range! {
    fst_range_4,
    min: Bound::Unbounded, max: Bound::Unbounded,
    imin: 0, imax: 4,
    "a", "b", "y", "z"
}

test_range! {
    fst_range_5,
    min: Bound::Included(b"abd".to_vec()), max: Bound::Unbounded,
    imin: 0, imax: 0,
    "a", "ab", "abc", "abcd", "abcde"
}

test_range! {
    fst_range_6,
    min: Bound::Included(b"abd".to_vec()), max: Bound::Unbounded,
    imin: 5, imax: 6,
    "a", "ab", "abc", "abcd", "abcde", "abe"
}

test_range! {
    fst_range_7,
    min: Bound::Excluded(b"abd".to_vec()), max: Bound::Unbounded,
    imin: 5, imax: 6,
    "a", "ab", "abc", "abcd", "abcde", "abe"
}

test_range! {
    fst_range_8,
    min: Bound::Included(b"abd".to_vec()), max: Bound::Unbounded,
    imin: 5, imax: 6,
    "a", "ab", "abc", "abcd", "abcde", "xyz"
}

test_range! {
    fst_range_9,
    min: Bound::Unbounded, max: Bound::Included(b"abd".to_vec()),
    imin: 0, imax: 5,
    "a", "ab", "abc", "abcd", "abcde", "abe"
}

test_range! {
    fst_range_10,
    min: Bound::Unbounded, max: Bound::Included(b"abd".to_vec()),
    imin: 0, imax: 6,
    "a", "ab", "abc", "abcd", "abcde", "abd"
}

test_range! {
    fst_range_11,
    min: Bound::Unbounded, max: Bound::Included(b"abd".to_vec()),
    imin: 0, imax: 6,
    "a", "ab", "abc", "abcd", "abcde", "abd", "abdx"
}

test_range! {
    fst_range_12,
    min: Bound::Unbounded, max: Bound::Excluded(b"abd".to_vec()),
    imin: 0, imax: 5,
    "a", "ab", "abc", "abcd", "abcde", "abe"
}

test_range! {
    fst_range_13,
    min: Bound::Unbounded, max: Bound::Excluded(b"abd".to_vec()),
    imin: 0, imax: 5,
    "a", "ab", "abc", "abcd", "abcde", "abd"
}

test_range! {
    fst_range_14,
    min: Bound::Unbounded, max: Bound::Excluded(b"abd".to_vec()),
    imin: 0, imax: 5,
    "a", "ab", "abc", "abcd", "abcde", "abd", "abdx"
}

test_range! {
    fst_range_15,
    min: Bound::Included(vec![b'd']), max: Bound::Included(vec![b'c']),
    imin: 0, imax: 0,
    "a", "b", "c", "d", "e", "f"
}

test_range! {
    fst_range_16,
    min: Bound::Included(vec![b'c']), max: Bound::Included(vec![b'c']),
    imin: 2, imax: 3,
    "a", "b", "c", "d", "e", "f"
}

test_range! {
    fst_range_17,
    min: Bound::Excluded(vec![b'c']), max: Bound::Excluded(vec![b'c']),
    imin: 0, imax: 0,
    "a", "b", "c", "d", "e", "f"
}

test_range! {
    fst_range_18,
    min: Bound::Included(vec![b'c']), max: Bound::Excluded(vec![b'c']),
    imin: 0, imax: 0,
    "a", "b", "c", "d", "e", "f"
}

test_range! {
    fst_range_19,
    min: Bound::Included(vec![b'c']), max: Bound::Excluded(vec![b'd']),
    imin: 2, imax: 3,
    "a", "b", "c", "d", "e", "f"
}

#[test]
fn one_vec_multiple_fsts() {
    let mut bfst1 = Builder::memory();
    bfst1.add(b"bar").unwrap();
    bfst1.add(b"baz").unwrap();
    let bytes = bfst1.into_inner().unwrap();
    let fst1_len = bytes.len();

    let mut bfst2 = Builder::new(bytes).unwrap();
    bfst2.add(b"bar").unwrap();
    bfst2.add(b"foo").unwrap();

    let bytes = bfst2.into_inner().unwrap();
    let slice1 = &bytes[0..fst1_len];
    let slice2 = &bytes[fst1_len..bytes.len()];

    let fst1 = Fst::new(slice1).unwrap();
    let fst2 = Fst::new(slice2).unwrap();

    assert_eq!(fst_inputs(&fst1), vec![b"bar".to_vec(), b"baz".to_vec()]);
    assert_eq!(fst_inputs(&fst2), vec![b"bar".to_vec(), b"foo".to_vec()]);
}

#[test]
fn bytes_written() {
    let mut bfst1 = Builder::memory();
    bfst1.add(b"bar").unwrap();
    bfst1.add(b"baz").unwrap();
    let counted_len = bfst1.bytes_written();
    let bytes = bfst1.into_inner().unwrap();
    let fst1_len = bytes.len() as u64;
    let footer_size = 24;
    assert_eq!(counted_len + footer_size, fst1_len);
}

#[test]
fn get_key_simple() {
    let map = fst_map(vec![("abc", 2), ("xyz", 3)]);
    assert_eq!(map.get_key(0), None);
    assert_eq!(map.get_key(1), None);
    assert_eq!(map.get_key(2), Some(b"abc".to_vec()));
    assert_eq!(map.get_key(3), Some(b"xyz".to_vec()));
    assert_eq!(map.get_key(4), None);
}

#[test]
fn get_key_words() {
    let words: Vec<(Vec<u8>, u64)> = TEXT
        .lines()
        .enumerate()
        .map(|(i, line)| (line.as_bytes().to_vec(), i as u64))
        .collect();
    let map = fst_map(words.clone());
    for (key, value) in words {
        assert_eq!(map.get_key(value), Some(key));
    }
}

#[test]
fn get_key_words_discontiguous() {
    let words: Vec<(Vec<u8>, u64)> = TEXT
        .lines()
        .enumerate()
        .map(|(i, line)| (line.as_bytes().to_vec(), i as u64 * 2))
        .collect();
    let map = fst_map(words.clone());
    for (key, value) in words {
        assert_eq!(map.get_key(value), Some(key));
    }
}
130
crates/ufst/src/stream.rs
Normal file
@@ -0,0 +1,130 @@
/// Streamer describes a "streaming iterator."
///
/// It provides a mechanism for writing code that is generic over streams
/// produced by this crate.
///
/// Note that this is strictly less useful than `Iterator` because the item
/// associated type is bound to a specific lifetime. However, this does permit
/// us to write *some* generic code over streams that produce values tied
/// to the lifetime of the stream.
///
/// Some form of stream abstraction is inherently required for this crate
/// because elements in a finite state transducer are produced *by iterating*
/// over the structure. The alternative would be to create a new allocation
/// for each element iterated over, which would be prohibitively expensive.
///
/// # Usage & motivation
///
/// Streams are hard to use because they don't fit into Rust's current type
/// system very well. They are so hard to use that this author loathes having a
/// publicly defined trait for it. Nevertheless, they do just barely provide
/// a means for composing multiple stream abstractions with different concrete
/// types. For example, one might want to take the union of a range query
/// stream with a stream that has been filtered by a regex. These streams have
/// different concrete types. A `Streamer` trait allows us to write code that
/// is generic over these concrete types. (All of the set operations are
/// implemented this way.)
///
/// A problem with streams is that the trait is itself parameterized by a
/// lifetime. In practice, this makes them very unergonomic because specifying
/// a `Streamer` bound generally requires a higher-ranked trait bound. This is
/// necessary because the lifetime can't actually be named in the enclosing
/// function; instead, the lifetime is local to iteration itself. Therefore,
/// one must assert that the bound is valid for *any particular* lifetime.
/// This is the essence of higher-rank trait bounds.
///
/// Because of this, you might expect to see lots of bounds that look like
/// this:
///
/// ```ignore
/// fn takes_stream<T, S>(s: S)
/// where S: for<'a> Streamer<'a, Item=T>
/// {
/// }
/// ```
///
/// There are *three* different problems with this declaration:
///
/// 1. `S` is not bound by any particular lifetime itself, and most streams
///    probably contain a reference to an underlying finite state transducer.
/// 2. It is often convenient to separate the notion of "stream" from
///    "stream constructor." This represents a similar split found in the
///    standard library for `Iterator` and `IntoIterator`, respectively.
/// 3. The `Item=T` is invalid because `Streamer`'s associated type is
///    parameterized by a lifetime and there is no way to parameterize an
///    arbitrary type constructor. (In this context, `T` is the type
///    constructor, because it will invariably require a lifetime to become
///    a concrete type.)
///
/// With that said, we must revise our possibly-workable bounds to a giant
/// scary monster:
///
/// ```ignore
/// fn takes_stream<'f, I, S>(s: I)
/// where I: for<'a> IntoStreamer<'a, Into=S, Item=(&'a [u8], Output)>,
///       S: 'f + for<'a> Streamer<'a, Item=(&'a [u8], Output)>
/// {
/// }
/// ```
///
/// We addressed the above points correspondingly:
///
/// 1. `S` is now bound by `'f`, which corresponds to the lifetime (possibly
///    `'static`) of the underlying stream.
/// 2. The `I` type parameter has been added to refer to a type that knows how
///    to build a stream. Notice that neither of the bounds for `I` or `S`
///    share a lifetime parameter. This is because the higher rank trait bound
///    specifies it works for *any* particular lifetime.
/// 3. `T` has been replaced with specific concrete types. Note that these
///    concrete types are duplicated. With iterators, we could use
///    `Item=S::Item` in the bound for `I`, but one cannot access an associated
///    type through a higher-ranked trait bound. Therefore, we must duplicate
///    the item type.
///
/// As you can see, streams offer little flexibility, little ergonomics and a
/// lot of hard to read trait bounds. The situation is lamentable, but
/// nevertheless, without them, we would not be able to compose streams by
/// leveraging the type system.
///
/// A redeeming quality is that these *same exact* trait bounds (modulo some
/// tweaks in the `Item` associated type) appear in many places in this crate
/// without much variation. Therefore, once you grok it, it's mostly easy to
/// pattern match it with "oh I need a stream." My hope is that clear
/// documentation and examples make these complex bounds easier to bear.
///
/// Stretching this abstraction further with Rust's current type system is not
/// advised.
pub trait Streamer<'a> {
    /// The type of the item emitted by this stream.
    type Item: 'a;

    /// Emits the next element in this stream, or `None` to indicate the stream
    /// has been exhausted.
    ///
    /// It is not specified what a stream does after `None` is emitted. In most
    /// cases, `None` should be emitted on every subsequent call.
    fn next(&'a mut self) -> Option<Self::Item>;
}

/// IntoStreamer describes types that can be converted to streams.
///
/// This is analogous to the `IntoIterator` trait for `Iterator` in
/// `std::iter`.
pub trait IntoStreamer<'a> {
    /// The type of the item emitted by the stream.
    type Item: 'a;
    /// The type of the stream to be constructed.
    type Into: Streamer<'a, Item = Self::Item>;

    /// Construct a stream from `Self`.
    fn into_stream(self) -> Self::Into;
}

impl<'a, S: Streamer<'a>> IntoStreamer<'a> for S {
    type Item = S::Item;
    type Into = S;

    fn into_stream(self) -> S {
        self
    }
}
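
// A usage sketch (not in the original commit) of the bounds described above:
// a function generic over anything that can become a stream of
// `(&[u8], Output)` pairs, collecting the keys.
#[allow(dead_code)]
fn collect_keys<'f, I, S>(stream: I) -> Vec<Vec<u8>>
where
    I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], crate::raw::Output)>,
    S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], crate::raw::Output)>,
{
    let mut stream = stream.into_stream();
    let mut keys = vec![];
    while let Some((key, _)) = stream.next() {
        keys.push(key.to_vec());
    }
    keys
}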
16
crates/unf/Cargo.toml
Normal file
@@ -0,0 +1,16 @@
[package]
name = "unf"
version = "0.1.0"
edition = "2021"

[dependencies]
fst = "0.4.7"
tinyvec = { version = "1.6.0", features = ["alloc"] }
ufst = { path = "../ufst" }

[build-dependencies]
ufst = { path = "../ufst" }

[dev-dependencies]
proptest = "1.0.0"
similar-asserts = "1.2.0"
66
crates/unf/build.rs
Normal file
@@ -0,0 +1,66 @@
use std::env;
use std::fs;
use std::path::Path;

use ufst::raw::Fst;

fn main() {
    let data = fs::read_to_string("data/UnicodeData.txt").unwrap();

    let out_dir = env::var_os("OUT_DIR").unwrap();
    let dest_path = Path::new(&out_dir).join("table.fst");

    let mut entries = parse(&data)
        .into_iter()
        .map(|(code_value, entry)| (code_value.to_ne_bytes(), entry))
        .collect::<Vec<_>>();

    entries.sort_unstable_by_key(|(k, _)| *k);

    let data = Fst::from_iter_map(entries).unwrap().into_inner();

    fs::write(&dest_path, data).unwrap();

    println!("cargo:rerun-if-changed=data/UnicodeData.txt");
    println!("cargo:rerun-if-changed=build.rs");
}

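/// Parse UnicodeData.txt into `(code point, packed entry)` pairs.
///
/// Entry bit layout (derived from the shifts below, and mirrored by the
/// decoding in `src/table.rs`):
///
///   bits  0..8   canonical combining class
///   bits  8..29  first canonical decomposition code point (21 bits)
///   bits 29..50  second canonical decomposition code point (21 bits)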
fn parse(data: &str) -> Vec<(u32, u64)> {
    let mut entries = Vec::new();

    for line in data.lines() {
        let mut iter = line.split(';');

        let code_point = iter
            .next()
            .map(|s| u32::from_str_radix(s, 16).expect("valid u32"))
            .expect("code value");

        let combining_class = iter
            .nth(2)
            .map(|s| s.parse::<u8>().expect("valid u8"))
            .expect("canonical combining classes");

        let mut entry = combining_class as u64;

        let decomposition_mapping = iter.nth(1).unwrap();

        // Mappings tagged with `<...>` are compatibility decompositions;
        // only canonical decompositions are packed into the entry.
        if !decomposition_mapping.starts_with('<') {
            let mappings = decomposition_mapping
                .split(' ')
                .filter(|s| !s.is_empty())
                .map(|s| u32::from_str_radix(s, 16).expect("valid u32"))
                .collect::<Vec<_>>();

            assert!(mappings.len() <= 2);

            for (i, mapping) in mappings.into_iter().enumerate() {
                entry |= (mapping as u64) << ((21 * i) + 8);
            }
        }

        entries.push((code_point, entry));
    }

    entries
}
9953
crates/unf/data/DerivedNormalizationProps.txt
Normal file
File diff suppressed because it is too large
19047
crates/unf/data/NormalizationTest.txt
Normal file
File diff suppressed because it is too large
1276
crates/unf/data/StandardizedVariants.txt
Normal file
File diff suppressed because it is too large
34626
crates/unf/data/UnicodeData.txt
Normal file
File diff suppressed because it is too large
309
crates/unf/src/lib.rs
Normal file
@@ -0,0 +1,309 @@
use std::iter::Fuse;
use std::ops::Range;
use std::str::Chars;

use tinyvec::TinyVec;

pub mod table;

use table::Decomposition;

pub fn nfd(s: &str) -> Decompositions<Chars<'_>> {
    Decompositions {
        iter: s.chars().fuse(),
        buffer: Buffer::new(),
    }
}
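
// Example (not in the original commit): precomposed "é" (U+00E9) decomposes
// under NFD to "e" followed by U+0301 COMBINING ACUTE ACCENT:
//
//     assert_eq!(nfd("\u{e9}").collect::<String>(), "e\u{301}");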

struct Buffer {
    // (combining class, char) pairs; the prefix covered by `ready` is
    // already in canonical order and safe to emit.
    buffer: TinyVec<[(u8, char); 4]>,
    ready: Range<usize>,
}

impl Buffer {
    fn new() -> Self {
        Self {
            buffer: TinyVec::new(),
            ready: 0..0,
        }
    }

    fn push_back(&mut self, ch: char) {
        let class = table::lookup(ch).combining_class();

        if class == 0 {
            // A starter terminates the current run of combining marks, so
            // everything buffered so far (plus the starter) becomes ready.
            self.sort_pending();

            self.buffer.push((class, ch));
            self.ready.end = self.buffer.len();
        } else {
            self.buffer.push((class, ch));
        }
    }

    fn sort_pending(&mut self) {
        // Canonical ordering: stable-sort pending combining marks by class.
        self.buffer[self.ready.end..].sort_by_key(|k| k.0);
    }

    fn reset(&mut self) {
        let pending = self.buffer.len() - self.ready.end;

        for i in 0..pending {
            self.buffer[i] = self.buffer[i + self.ready.end];
        }

        self.buffer.truncate(pending);
        self.ready = 0..0;
    }

    fn increment_next_ready(&mut self) {
        let next = self.ready.start + 1;

        if next == self.ready.end {
            self.reset();
        } else {
            self.ready.start = next;
        }
    }
}

pub struct Decompositions<I> {
    iter: Fuse<I>,
    buffer: Buffer,
}

impl<I: Iterator<Item = char>> Iterator for Decompositions<I> {
    type Item = char;

    fn next(&mut self) -> Option<Self::Item> {
        while self.buffer.ready.end == 0 {
            match self.iter.next() {
                Some(ch) => {
                    decompose(ch, &mut self.buffer);
                }
                None => {
                    if self.buffer.buffer.is_empty() {
                        return None;
                    } else {
                        self.buffer.sort_pending();
                        self.buffer.ready.end = self.buffer.buffer.len();

                        break;
                    }
                }
            }
        }

        let (_, ch) = self.buffer.buffer[self.buffer.ready.start];

        self.buffer.increment_next_ready();

        Some(ch)
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        let (lower, _) = self.iter.size_hint();

        (lower, None)
    }
}

// Constants from Unicode 9.0.0 Section 3.12 Conjoining Jamo Behavior
// http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf#M9.32468.Heading.310.Combining.Jamo.Behavior
const S_BASE: u32 = 0xAC00;
const L_BASE: u32 = 0x1100;
const V_BASE: u32 = 0x1161;
const T_BASE: u32 = 0x11A7;
const L_COUNT: u32 = 19;
const V_COUNT: u32 = 21;
const T_COUNT: u32 = 28;
const N_COUNT: u32 = V_COUNT * T_COUNT;
const S_COUNT: u32 = L_COUNT * N_COUNT;

fn decompose(c: char, buffer: &mut Buffer) {
    // 7-bit ASCII never decomposes
    if c <= '\x7f' {
        buffer.push_back(c);
        return;
    }

    // Perform decomposition for Hangul
    if is_hangul_syllable(c) {
        decompose_hangul(c, buffer);
        return;
    }

    if let Some(decomposed) = table::lookup(c).decomposition() {
        match decomposed {
            Decomposition::Single(f) => {
                decompose(f, buffer);
            }
            Decomposition::Double(f, s) => {
                decompose(f, buffer);
                decompose(s, buffer);
            }
        }
        return;
    }

    // Finally bottom out.
    buffer.push_back(c);
}

fn is_hangul_syllable(c: char) -> bool {
    (c as u32) >= S_BASE && (c as u32) < (S_BASE + S_COUNT)
}

#[allow(unsafe_code)]
fn decompose_hangul(s: char, buffer: &mut Buffer) {
    let s_index = s as u32 - S_BASE;
    let l_index = s_index / N_COUNT;

    // Safety: every code point computed below lies in the conjoining jamo
    // block (U+1100..U+11FF), so each is a valid scalar value.
    unsafe {
        buffer.push_back(char::from_u32_unchecked(L_BASE + l_index));

        let v_index = (s_index % N_COUNT) / T_COUNT;
        buffer.push_back(char::from_u32_unchecked(V_BASE + v_index));

        let t_index = s_index % T_COUNT;
        if t_index > 0 {
            buffer.push_back(char::from_u32_unchecked(T_BASE + t_index));
        }
    }
}
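
// Worked example (not in the original commit): U+AC01 (각) has s_index = 1,
// so l_index = 0, v_index = 0 and t_index = 1, decomposing to
// U+1100, U+1161, U+11A8.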

#[cfg(test)]
mod tests {
    use std::fs::File;
    use std::io::{BufRead, BufReader};

    use super::*;

    #[test]
    fn test_unicode_normalization() {
        let data = File::open("data/NormalizationTest.txt")
            .map(BufReader::new)
            .expect("unicode normalization test file");

        #[derive(Default)]
        struct Entry {
            source: String,
            nfc: String,
            nfd: String,
            nfkc: String,
            nfkd: String,
            comment: String,
        }

        // Decode a field of space-separated hex code points into a string.
        fn parse_chars(field: &str) -> String {
            field
                .split(' ')
                .map(|v| u32::from_str_radix(v, 16).expect("valid u32 value as hex"))
                .map(|v| char::from_u32(v).expect("valid char"))
                .collect()
        }

        for (i, line) in data.lines().enumerate() {
            let line = line.expect("line");

            if line.is_empty() {
                continue;
            }

            if line.starts_with(['#', '@']) {
                continue;
            }

            let entry = line.splitn(6, ';').enumerate().fold(
                Entry::default(),
                |mut entry, (i, string)| {
                    match i {
                        0 => entry.source = parse_chars(string),
                        1 => entry.nfc = parse_chars(string),
                        2 => entry.nfd = parse_chars(string),
                        3 => entry.nfkc = parse_chars(string),
                        4 => entry.nfkd = parse_chars(string),
                        5 => {
                            entry.comment = string
                                .trim_start_matches(['#', ' '])
                                .trim_end()
                                .to_string()
                        }
                        _ => unreachable!(),
                    }

                    entry
                },
            );

            // c3 == toNFD(c1) == toNFD(c2) == toNFD(c3)
            similar_asserts::assert_str_eq!(
                nfd(&entry.source).collect::<String>(),
                entry.nfd,
                "c3 == toNFD(c1) at line {} # {}",
                i + 1,
                entry.comment
            );
            similar_asserts::assert_str_eq!(
                nfd(&entry.nfc).collect::<String>(),
                entry.nfd,
                "c3 == toNFD(c2) at line {} # {}",
                i + 1,
                entry.comment
            );
            similar_asserts::assert_str_eq!(
                nfd(&entry.nfd).collect::<String>(),
                entry.nfd,
                "c3 == toNFD(c3) at line {} # {}",
                i + 1,
                entry.comment
            );

            // c5 == toNFD(c4) == toNFD(c5)
            similar_asserts::assert_str_eq!(
                nfd(&entry.nfkc).collect::<String>(),
                entry.nfkd,
                "c5 == toNFD(c4) at line {} # {}",
                i + 1,
                entry.comment
            );
            similar_asserts::assert_str_eq!(
                nfd(&entry.nfkd).collect::<String>(),
                entry.nfkd,
                "c5 == toNFD(c5) at line {} # {}",
                i + 1,
                entry.comment
            );
        }
    }
}
114
crates/unf/src/table.rs
Normal file
@@ -0,0 +1,114 @@
use ufst::raw::Fst;

const TABLE: Fst<&'static [u8]> =
    Fst::new_unchecked(include_bytes!(concat!(env!("OUT_DIR"), "/table.fst")));

pub fn lookup(ch: char) -> Entry {
    Entry::new(
        TABLE
            .get((ch as u32).to_ne_bytes())
            .map(|output| output.value())
            .unwrap_or(0),
    )
}

#[derive(Clone, Copy, PartialEq, Debug)]
pub enum Decomposition {
    Single(char),
    Double(char, char),
}

#[derive(Clone, Copy, PartialEq, Debug)]
pub struct Entry(u64);

impl Entry {
    pub(crate) fn new(data: u64) -> Self {
        Self(data)
    }

    pub fn combining_class(&self) -> u8 {
        (self.0 & 0xFF) as u8
    }

    pub(crate) fn decomposition(&self) -> Option<Decomposition> {
        let m1 = ((self.0 >> 8) & 0x1FFFFF) as u32;

        if m1 > 0 {
            let m2 = ((self.0 >> 29) & 0x1FFFFF) as u32;

            if m2 > 0 {
                unsafe {
                    Some(Decomposition::Double(
                        char::from_u32_unchecked(m1),
                        char::from_u32_unchecked(m2),
                    ))
                }
            } else {
                unsafe { Some(Decomposition::Single(char::from_u32_unchecked(m1))) }
            }
        } else {
            None
        }
    }
}
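
// Worked example (not in the original commit): U+00C0 (À) has combining
// class 0 and canonically decomposes to U+0041 U+0300, so its packed entry
// is (0x41 << 8) | (0x300 << 29) and `decomposition()` yields
// `Decomposition::Double('A', '\u{300}')`.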

#[cfg(test)]
mod tests {
    use proptest::prelude::*;

    use super::*;

    fn entry_strategy() -> impl Strategy<Value = (u64, (u8, u8, char, char))> {
        (
            any::<u8>(),
            // 0, 1, or 2 decomposition mappings; with the previous `0u8..2`
            // the Double case below was never generated.
            (0u8..3),
            any::<char>().prop_filter("non-NUL", |c| *c != '\u{0}'),
            any::<char>().prop_filter("non-NUL", |c| *c != '\u{0}'),
        )
            .prop_map(
                |(combining_class, mapping_count, decomposition_first, decomposition_second)| {
                    let mut entry = combining_class as u64;

                    if mapping_count > 0 {
                        entry |= (decomposition_first as u64) << 8;
                    }

                    if mapping_count > 1 {
                        entry |= (decomposition_second as u64) << (21 + 8);
                    }

                    (
                        entry,
                        (
                            combining_class,
                            mapping_count,
                            decomposition_first,
                            decomposition_second,
                        ),
                    )
                },
            )
    }

    proptest! {
        #[test]
        fn proptest_entry_serialize_and_deserialize(a in entry_strategy()) {
            let (data, (combining_class, mapping_count, decomposition_first, decomposition_second)) = a;

            let b = Entry::new(data);

            prop_assert_eq!(b.combining_class(), combining_class, "data = {:064b}", data);

            match mapping_count {
                0 => prop_assert_eq!(b.decomposition(), None, "data = {:064b}", data),
                1 => prop_assert_eq!(b.decomposition(), Some(Decomposition::Single(decomposition_first)), "data = {:064b}", data),
                2 => prop_assert_eq!(b.decomposition(), Some(Decomposition::Double(decomposition_first, decomposition_second)), "data = {:064b}", data),
                _ => unreachable!(),
            }
        }
    }
}