Use fst instead of phf.
This commit is contained in:
@@ -6,9 +6,9 @@ edition = "2018"
|
||||
build = "build.rs"
|
||||
|
||||
[dependencies]
|
||||
fst = "0.4"
|
||||
isolang = "1.0"
|
||||
phf = "0.8"
|
||||
once_cell = "1.4"
|
||||
|
||||
[build-dependencies]
|
||||
isolang = "1.0"
|
||||
phf_codegen = "0.8"
|
||||
fst = "0.4"
|
||||
|
||||
@@ -1,43 +1,51 @@
|
||||
use std::env;
|
||||
use std::error::Error;
|
||||
use std::fs::File;
|
||||
use std::fs::{File, OpenOptions};
|
||||
use std::io::{BufRead, BufReader, BufWriter, Write};
|
||||
use std::path::Path;
|
||||
|
||||
/// Returns `s` with its first character upper-cased and the remainder left
/// untouched. An empty input yields an empty `String`.
///
/// The upper-case form of the first character may consist of more than one
/// character; all of them are kept.
fn title(s: &str) -> String {
    let mut rest = s.chars();

    let first = match rest.next() {
        Some(ch) => ch,
        None => return String::new(),
    };

    let mut out = String::with_capacity(s.len());
    out.extend(first.to_uppercase());
    out.push_str(rest.as_str());
    out
}
|
||||
use fst::MapBuilder;
|
||||
|
||||
fn main() -> Result<(), Box<dyn Error>> {
|
||||
let mut map = phf_codegen::Map::new();
|
||||
|
||||
let lines = File::open("languages.txt")
|
||||
.map(BufReader::new)?
|
||||
.lines()
|
||||
.filter_map(Result::ok)
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let mut entries = Vec::new();
|
||||
|
||||
let mut buf = [0; 8];
|
||||
|
||||
for line in &lines {
|
||||
let parts = line.splitn(2, '\t').collect::<Vec<_>>();
|
||||
|
||||
let name = parts[0];
|
||||
let language = parts[1];
|
||||
|
||||
map.entry(name, &format!("Language::{}", title(language)));
|
||||
buf[5..].copy_from_slice(language.as_bytes());
|
||||
let value = u64::from_be_bytes(buf);
|
||||
|
||||
entries.push((name, value));
|
||||
}
|
||||
|
||||
let path = Path::new(&env::var("OUT_DIR")?).join("languages.rs");
|
||||
let mut file = BufWriter::new(File::create(&path)?);
|
||||
entries.sort_unstable();
|
||||
entries.dedup();
|
||||
|
||||
write!(
|
||||
&mut file,
|
||||
"static LANGUAGES: phf::Map<&'static str, Language> = {};\n",
|
||||
map.build()
|
||||
)?;
|
||||
let mut build = MapBuilder::memory();
|
||||
build.extend_iter(entries)?;
|
||||
|
||||
let data = build.into_inner()?;
|
||||
let path = Path::new(&env::var("OUT_DIR")?).join("text.fst");
|
||||
|
||||
OpenOptions::new()
|
||||
.write(true)
|
||||
.create(true)
|
||||
.truncate(true)
|
||||
.open(path)
|
||||
.map(BufWriter::new)
|
||||
.and_then(|mut file| file.write_all(&data))
|
||||
.expect("failed to save invalid");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1,13 +1,80 @@
|
||||
use isolang::Language;
|
||||
use std::str;
|
||||
|
||||
include!(concat!(env!("OUT_DIR"), "/languages.rs"));
|
||||
use fst::raw::{Fst, Output};
|
||||
pub use isolang::Language;
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
pub trait LanguageExt {
|
||||
fn from_name(name: &str) -> Option<Language>;
|
||||
static TEXT: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/text.fst"));
|
||||
static FST: Lazy<Fst<&'static [u8]>> = Lazy::new(|| Fst::new(TEXT).unwrap());
|
||||
const N: usize = 128;
|
||||
|
||||
#[derive(Copy, Clone, PartialEq, Debug)]
|
||||
pub struct Match {
|
||||
pub language: Language,
|
||||
pub start: usize,
|
||||
pub end: usize,
|
||||
}
|
||||
|
||||
impl LanguageExt for Language {
|
||||
fn from_name(name: &str) -> Option<Language> {
|
||||
LANGUAGES.get(name).cloned()
|
||||
pub fn scan(input: impl AsRef<str>) -> Vec<Match> {
|
||||
let input = input.as_ref().to_lowercase();
|
||||
|
||||
let mut result = Vec::new();
|
||||
|
||||
let mut stack = vec![None; N];
|
||||
let mut index = 0;
|
||||
|
||||
for (n, byte) in input.as_bytes().iter().enumerate() {
|
||||
index %= N;
|
||||
stack[index] = Some((FST.root(), Output::zero()));
|
||||
|
||||
for (m, entry) in stack.iter_mut().enumerate() {
|
||||
if let Some((mut node, mut out)) = entry.take() {
|
||||
if let Some(i) = node.find_input(*byte) {
|
||||
let t = node.transition(i);
|
||||
|
||||
out = out.cat(t.out);
|
||||
node = FST.node(t.addr);
|
||||
|
||||
if node.is_final() {
|
||||
let value = out.cat(node.final_output()).value();
|
||||
|
||||
let len = ((index as i32 - m as i32 + N as i32) as usize % N) + 1;
|
||||
let idx = (n + 1) - len;
|
||||
|
||||
let bytes = value.to_be_bytes();
|
||||
let code = str::from_utf8(&bytes[5..]).unwrap();
|
||||
|
||||
result.push(Match {
|
||||
language: Language::from_639_3(code).unwrap(),
|
||||
start: idx,
|
||||
end: idx + len,
|
||||
});
|
||||
}
|
||||
|
||||
*entry = Some((node, out));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
index += 1;
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// A lone language name yields exactly one match covering the whole input.
    #[test]
    fn test_swedish() {
        let expected = vec![Match {
            language: Language::Swe,
            start: 0,
            end: 7,
        }];

        assert_eq!(scan("swedish"), expected);
    }
}
|
||||
|
||||
Reference in New Issue
Block a user