Use fst instead of phf.

This commit is contained in:
2020-09-24 20:20:23 +02:00
parent adc16c9e14
commit fdbdccc47c
3 changed files with 103 additions and 28 deletions

View File

@@ -6,9 +6,9 @@ edition = "2018"
build = "build.rs" build = "build.rs"
[dependencies] [dependencies]
fst = "0.4"
isolang = "1.0" isolang = "1.0"
phf = "0.8" once_cell = "1.4"
[build-dependencies] [build-dependencies]
isolang = "1.0" fst = "0.4"
phf_codegen = "0.8"

View File

@@ -1,43 +1,51 @@
use std::env; use std::env;
use std::error::Error; use std::error::Error;
use std::fs::File; use std::fs::{File, OpenOptions};
use std::io::{BufRead, BufReader, BufWriter, Write}; use std::io::{BufRead, BufReader, BufWriter, Write};
use std::path::Path; use std::path::Path;
fn title(s: &str) -> String { use fst::MapBuilder;
let mut c = s.chars();
match c.next() {
None => String::new(),
Some(f) => f.to_uppercase().chain(c).collect(),
}
}
fn main() -> Result<(), Box<dyn Error>> { fn main() -> Result<(), Box<dyn Error>> {
let mut map = phf_codegen::Map::new();
let lines = File::open("languages.txt") let lines = File::open("languages.txt")
.map(BufReader::new)? .map(BufReader::new)?
.lines() .lines()
.filter_map(Result::ok) .filter_map(Result::ok)
.collect::<Vec<_>>(); .collect::<Vec<_>>();
let mut entries = Vec::new();
let mut buf = [0; 8];
for line in &lines { for line in &lines {
let parts = line.splitn(2, '\t').collect::<Vec<_>>(); let parts = line.splitn(2, '\t').collect::<Vec<_>>();
let name = parts[0]; let name = parts[0];
let language = parts[1]; let language = parts[1];
map.entry(name, &format!("Language::{}", title(language))); buf[5..].copy_from_slice(language.as_bytes());
let value = u64::from_be_bytes(buf);
entries.push((name, value));
} }
let path = Path::new(&env::var("OUT_DIR")?).join("languages.rs"); entries.sort_unstable();
let mut file = BufWriter::new(File::create(&path)?); entries.dedup();
write!( let mut build = MapBuilder::memory();
&mut file, build.extend_iter(entries)?;
"static LANGUAGES: phf::Map<&'static str, Language> = {};\n",
map.build() let data = build.into_inner()?;
)?; let path = Path::new(&env::var("OUT_DIR")?).join("text.fst");
OpenOptions::new()
.write(true)
.create(true)
.truncate(true)
.open(path)
.map(BufWriter::new)
.and_then(|mut file| file.write_all(&data))
.expect("failed to save invalid");
Ok(()) Ok(())
} }

View File

@@ -1,13 +1,80 @@
use isolang::Language; use std::str;
include!(concat!(env!("OUT_DIR"), "/languages.rs")); use fst::raw::{Fst, Output};
pub use isolang::Language;
use once_cell::sync::Lazy;
pub trait LanguageExt { static TEXT: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/text.fst"));
fn from_name(name: &str) -> Option<Language>; static FST: Lazy<Fst<&'static [u8]>> = Lazy::new(|| Fst::new(TEXT).unwrap());
const N: usize = 128;
#[derive(Copy, Clone, PartialEq, Debug)]
pub struct Match {
pub language: Language,
pub start: usize,
pub end: usize,
} }
impl LanguageExt for Language { pub fn scan(input: impl AsRef<str>) -> Vec<Match> {
fn from_name(name: &str) -> Option<Language> { let input = input.as_ref().to_lowercase();
LANGUAGES.get(name).cloned()
let mut result = Vec::new();
let mut stack = vec![None; N];
let mut index = 0;
for (n, byte) in input.as_bytes().iter().enumerate() {
index %= N;
stack[index] = Some((FST.root(), Output::zero()));
for (m, entry) in stack.iter_mut().enumerate() {
if let Some((mut node, mut out)) = entry.take() {
if let Some(i) = node.find_input(*byte) {
let t = node.transition(i);
out = out.cat(t.out);
node = FST.node(t.addr);
if node.is_final() {
let value = out.cat(node.final_output()).value();
let len = ((index as i32 - m as i32 + N as i32) as usize % N) + 1;
let idx = (n + 1) - len;
let bytes = value.to_be_bytes();
let code = str::from_utf8(&bytes[5..]).unwrap();
result.push(Match {
language: Language::from_639_3(code).unwrap(),
start: idx,
end: idx + len,
});
}
*entry = Some((node, out));
}
}
}
index += 1;
}
result
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A lone language name is reported once, spanning the whole input.
    #[test]
    fn test_swedish() {
        assert_eq!(
            scan("swedish"),
            vec![Match {
                language: Language::Swe,
                start: 0,
                end: 7,
            }]
        );
    }

    /// Empty input yields no matches.
    #[test]
    fn test_empty_input() {
        assert!(scan("").is_empty());
    }

    /// Input is lowercased before matching, so case does not affect the result.
    #[test]
    fn test_case_insensitive() {
        assert_eq!(scan("SWEDISH"), scan("swedish"));
    }
}