diff --git a/txtlang/Cargo.toml b/txtlang/Cargo.toml index 0f147b5..f7b2fbe 100644 --- a/txtlang/Cargo.toml +++ b/txtlang/Cargo.toml @@ -6,6 +6,7 @@ edition = "2018" build = "build.rs" [dependencies] +bstr = "0.2" fst = "0.4" isolang = "1.0" once_cell = "1.4" diff --git a/txtlang/src/lib.rs b/txtlang/src/lib.rs index 74048a8..cde95b4 100644 --- a/txtlang/src/lib.rs +++ b/txtlang/src/lib.rs @@ -1,5 +1,6 @@ use std::str; +use bstr::ByteSlice; use fst::raw::{Fst, Output}; pub use isolang::Language; use once_cell::sync::Lazy; @@ -15,9 +16,14 @@ pub struct Match { pub end: usize, } -pub fn scan(input: impl AsRef<str>) -> Vec<Match> { +pub fn scan(input: impl AsRef<[u8]>) -> Vec<Match> { let input = input.as_ref().to_lowercase(); + let (start_indices, end_indices): (Vec<_>, Vec<_>) = input + .word_indices() + .map(|(start, end, _)| (start, end)) + .unzip(); + let mut result = Vec::new(); let mut stack = vec![None; N]; @@ -44,11 +50,13 @@ pub fn scan(input: impl AsRef<str>) -> Vec<Match> { let bytes = value.to_be_bytes(); let code = str::from_utf8(&bytes[5..]).unwrap(); - result.push(Match { - language: Language::from_639_3(code).unwrap(), - start: idx, - end: idx + len, - }); + if start_indices.contains(&idx) && end_indices.contains(&(idx + len)) { + result.push(Match { + language: Language::from_639_3(code).unwrap(), + start: idx, + end: idx + len, + }); + } } *entry = Some((node, out)); @@ -77,4 +85,9 @@ mod tests { }] ); } + + #[test] + fn test_bug_001() { + assert_eq!(scan("ampersand"), vec![]); + } }