diff --git a/txtlang/Cargo.toml b/txtlang/Cargo.toml index 70e41f0..0f147b5 100644 --- a/txtlang/Cargo.toml +++ b/txtlang/Cargo.toml @@ -6,9 +6,9 @@ edition = "2018" build = "build.rs" [dependencies] +fst = "0.4" isolang = "1.0" -phf = "0.8" +once_cell = "1.4" [build-dependencies] -isolang = "1.0" -phf_codegen = "0.8" +fst = "0.4" diff --git a/txtlang/build.rs b/txtlang/build.rs index 659bf89..89d6a74 100644 --- a/txtlang/build.rs +++ b/txtlang/build.rs @@ -1,43 +1,51 @@ use std::env; use std::error::Error; -use std::fs::File; +use std::fs::{File, OpenOptions}; use std::io::{BufRead, BufReader, BufWriter, Write}; use std::path::Path; -fn title(s: &str) -> String { - let mut c = s.chars(); - match c.next() { - None => String::new(), - Some(f) => f.to_uppercase().chain(c).collect(), - } -} +use fst::MapBuilder; fn main() -> Result<(), Box<dyn Error>> { - let mut map = phf_codegen::Map::new(); - let lines = File::open("languages.txt") .map(BufReader::new)? .lines() .filter_map(Result::ok) .collect::<Vec<_>>(); + let mut entries = Vec::new(); + + let mut buf = [0; 8]; + for line in &lines { let parts = line.splitn(2, '\t').collect::<Vec<_>>(); let name = parts[0]; let language = parts[1]; - map.entry(name, &format!("Language::{}", title(language))); + buf[5..].copy_from_slice(language.as_bytes()); + let value = u64::from_be_bytes(buf); + + entries.push((name, value)); } - let path = Path::new(&env::var("OUT_DIR")?).join("languages.rs"); - let mut file = BufWriter::new(File::create(&path)?); + entries.sort_unstable(); + entries.dedup(); - write!( - &mut file, - "static LANGUAGES: phf::Map<&'static str, Language> = {};\n", - map.build() - )?; + let mut build = MapBuilder::memory(); + build.extend_iter(entries)?; + + let data = build.into_inner()?; + let path = Path::new(&env::var("OUT_DIR")?).join("text.fst"); + + OpenOptions::new() + .write(true) + .create(true) + .truncate(true) + .open(path) + .map(BufWriter::new) + .and_then(|mut file| file.write_all(&data)) + .expect("failed to save 
invalid"); Ok(()) } diff --git a/txtlang/src/lib.rs b/txtlang/src/lib.rs index 076731d..74048a8 100644 --- a/txtlang/src/lib.rs +++ b/txtlang/src/lib.rs @@ -1,13 +1,80 @@ -use isolang::Language; +use std::str; -include!(concat!(env!("OUT_DIR"), "/languages.rs")); +use fst::raw::{Fst, Output}; +pub use isolang::Language; +use once_cell::sync::Lazy; -pub trait LanguageExt { - fn from_name(name: &str) -> Option<Language>; +static TEXT: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/text.fst")); +static FST: Lazy<Fst<&[u8]>> = Lazy::new(|| Fst::new(TEXT).unwrap()); +const N: usize = 128; + +#[derive(Copy, Clone, PartialEq, Debug)] +pub struct Match { + pub language: Language, + pub start: usize, + pub end: usize, } -impl LanguageExt for Language { - fn from_name(name: &str) -> Option<Language> { - LANGUAGES.get(name).cloned() +pub fn scan(input: impl AsRef<str>) -> Vec<Match> { + let input = input.as_ref().to_lowercase(); + + let mut result = Vec::new(); + + let mut stack = vec![None; N]; + let mut index = 0; + + for (n, byte) in input.as_bytes().iter().enumerate() { + index %= N; + stack[index] = Some((FST.root(), Output::zero())); + + for (m, entry) in stack.iter_mut().enumerate() { + if let Some((mut node, mut out)) = entry.take() { + if let Some(i) = node.find_input(*byte) { + let t = node.transition(i); + + out = out.cat(t.out); + node = FST.node(t.addr); + + if node.is_final() { + let value = out.cat(node.final_output()).value(); + + let len = ((index as i32 - m as i32 + N as i32) as usize % N) + 1; + let idx = (n + 1) - len; + + let bytes = value.to_be_bytes(); + let code = str::from_utf8(&bytes[5..]).unwrap(); + + result.push(Match { + language: Language::from_639_3(code).unwrap(), + start: idx, + end: idx + len, + }); + } + + *entry = Some((node, out)); + } + } + } + + index += 1; + } + + result +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_swedish() { + assert_eq!( + scan("swedish"), + vec![Match { + language: Language::Swe, + start: 0, + end: 7, + }] + ); } }