Use fst instead of phf.

This commit is contained in:
2020-09-24 20:20:23 +02:00
parent adc16c9e14
commit fdbdccc47c
3 changed files with 103 additions and 28 deletions

View File

@@ -6,9 +6,9 @@ edition = "2018"
build = "build.rs"
[dependencies]
fst = "0.4"
isolang = "1.0"
phf = "0.8"
once_cell = "1.4"
[build-dependencies]
isolang = "1.0"
phf_codegen = "0.8"
fst = "0.4"

View File

@@ -1,43 +1,51 @@
use std::env;
use std::error::Error;
use std::fs::File;
use std::fs::{File, OpenOptions};
use std::io::{BufRead, BufReader, BufWriter, Write};
use std::path::Path;
/// Returns `s` with its first character converted to upper case.
///
/// Only the leading character is touched; the remainder of the string is
/// copied through unchanged. An empty input yields an empty `String`.
/// The upper-case mapping may expand to several characters.
fn title(s: &str) -> String {
    let mut rest = s.chars();
    if let Some(first) = rest.next() {
        let mut out = String::with_capacity(s.len());
        out.extend(first.to_uppercase());
        // `rest` has already consumed the first character, so this is the tail.
        out.push_str(rest.as_str());
        out
    } else {
        String::new()
    }
}
use fst::MapBuilder;
// Build-script entry point: reads `languages.txt`, turns every line into a
// (name, packed language code) pair and serialises the pairs as an fst map
// into `$OUT_DIR/text.fst`.
//
// NOTE(review): this listing appears to interleave lines from before and
// after the commit ("Use fst instead of phf"). The statements that touch
// `phf_codegen`, `map`, `title` and `languages.rs` look like the pre-change
// version -- confirm against the real build.rs before relying on them.
fn main() -> Result<(), Box<dyn Error>> {
let mut map = phf_codegen::Map::new();
// All lines are collected up front so that `entries` can borrow `name`
// slices from them for the rest of the function.
let lines = File::open("languages.txt")
.map(BufReader::new)?
.lines()
// Lines that fail to read are dropped silently here, not reported.
.filter_map(Result::ok)
.collect::<Vec<_>>();
let mut entries = Vec::new();
// Scratch buffer for the packed value: only bytes 5..8 are ever written, the
// leading five bytes stay zero.
let mut buf = [0; 8];
for line in &lines {
// Each line is split at the first tab into `name` and `language`.
let parts = line.splitn(2, '\t').collect::<Vec<_>>();
let name = parts[0];
// Indexing panics if the line contains no tab.
let language = parts[1];
map.entry(name, &format!("Language::{}", title(language)));
// `copy_from_slice` panics unless `language` is exactly 3 bytes long
// (the length of `buf[5..]`).
buf[5..].copy_from_slice(language.as_bytes());
// The value is the 8-byte buffer read as a big-endian u64, i.e. the three
// code bytes end up in the low 24 bits.
let value = u64::from_be_bytes(buf);
entries.push((name, value));
}
let path = Path::new(&env::var("OUT_DIR")?).join("languages.rs");
let mut file = BufWriter::new(File::create(&path)?);
// Sort by (name, value) and drop exact duplicates before building the map.
// NOTE(review): `dedup` only removes identical pairs; the same name with two
// different values would survive -- presumably the fst builder rejects
// duplicate or out-of-order keys, verify against the fst documentation.
entries.sort_unstable();
entries.dedup();
write!(
&mut file,
"static LANGUAGES: phf::Map<&'static str, Language> = {};\n",
map.build()
)?;
// Build the fst map in memory, then take the serialised bytes.
let mut build = MapBuilder::memory();
build.extend_iter(entries)?;
let data = build.into_inner()?;
let path = Path::new(&env::var("OUT_DIR")?).join("text.fst");
// Create (or truncate) the output file and write the serialised map.
// NOTE(review): the `BufWriter` is dropped at the end of the closure without
// an explicit `flush`, so an error during the final flush would go unnoticed.
OpenOptions::new()
.write(true)
.create(true)
.truncate(true)
.open(path)
.map(BufWriter::new)
.and_then(|mut file| file.write_all(&data))
// Unlike the `?` uses above, an I/O failure here panics instead of being
// returned to the caller.
.expect("failed to save invalid");
Ok(())
}

View File

@@ -1,13 +1,80 @@
use isolang::Language;
use std::str;
include!(concat!(env!("OUT_DIR"), "/languages.rs"));
use fst::raw::{Fst, Output};
pub use isolang::Language;
use once_cell::sync::Lazy;
pub trait LanguageExt {
fn from_name(name: &str) -> Option<Language>;
/// Raw bytes of `$OUT_DIR/text.fst`, embedded into the binary at compile time.
static TEXT: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/text.fst"));
/// Transducer over [`TEXT`], constructed on first access.
///
/// The `unwrap` panics on first use if the embedded bytes are not a valid fst.
static FST: Lazy<Fst<&'static [u8]>> = Lazy::new(|| Fst::new(TEXT).unwrap());
/// Number of slots `scan` allocates for its in-flight traversals; also the
/// modulus of its slot index.
const N: usize = 128;
/// A single language name found by [`scan`].
///
/// `start` and `end` delimit the matched name as a half-open byte range
/// (`start..end`) within the lower-cased copy of the scanned input.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct Match {
    /// Language the matched name refers to.
    pub language: Language,
    /// Byte offset of the first byte of the match.
    pub start: usize,
    /// Byte offset one past the last byte of the match.
    pub end: usize,
}
impl LanguageExt for Language {
fn from_name(name: &str) -> Option<Language> {
LANGUAGES.get(name).cloned()
/// Finds every language name from the embedded map that occurs in `input`.
///
/// The input is lower-cased first and then walked byte by byte. At every
/// position a fresh traversal is started from the root of `FST`, and all
/// traversals still alive are advanced by the current byte; whenever one of
/// them reaches a final node a [`Match`] is recorded. Overlapping and nested
/// matches are all reported, in the order in which they end.
///
/// The traversals live in a ring of `N` slots, so a slot is reused `N` bytes
/// after its traversal started.
///
/// The `start`/`end` offsets refer to the lower-cased text, which can differ
/// in byte length from the original input.
///
/// # Panics
///
/// Panics if a value stored in the map does not decode to a valid UTF-8
/// ISO 639-3 code known to `Language::from_639_3`.
pub fn scan(input: impl AsRef<str>) -> Vec<Match> {
    let lowered = input.as_ref().to_lowercase();
    let mut matches = Vec::new();
    let mut ring = vec![None; N];

    for (pos, &byte) in lowered.as_bytes().iter().enumerate() {
        // Slot that receives the traversal starting at this byte.
        let head = pos % N;
        ring[head] = Some((FST.root(), Output::zero()));

        for (slot, cursor) in ring.iter_mut().enumerate() {
            // Empty slots have nothing to advance.
            let (node, out) = match cursor.take() {
                Some(state) => state,
                None => continue,
            };
            // A traversal with no transition on this byte is dropped: the slot
            // stays empty because of the `take` above.
            let transition = match node.find_input(byte) {
                Some(i) => node.transition(i),
                None => continue,
            };
            let out = out.cat(transition.out);
            let node = FST.node(transition.addr);

            if node.is_final() {
                let value = out.cat(node.final_output()).value();
                // Distance between the slot where this traversal started and
                // the current head, i.e. the number of bytes consumed so far.
                let len = (head + N - slot) % N + 1;
                let start = pos + 1 - len;
                // The three-letter code sits in the last three bytes of the
                // big-endian value.
                let bytes = value.to_be_bytes();
                let code = str::from_utf8(&bytes[5..]).unwrap();
                matches.push(Match {
                    language: Language::from_639_3(code).unwrap(),
                    start,
                    end: start + len,
                });
            }
            // Keep the traversal alive: a longer name may still follow.
            *cursor = Some((node, out));
        }
    }

    matches
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Scanning the bare word "swedish" yields exactly one match that spans
    /// the whole input.
    #[test]
    fn test_swedish() {
        let expected = Match {
            language: Language::Swe,
            start: 0,
            end: 7,
        };
        assert_eq!(scan("swedish"), vec![expected]);
    }
}