Check for word boundaries.

This commit is contained in:
2020-09-25 07:35:22 +02:00
parent fdbdccc47c
commit 354350f734
2 changed files with 20 additions and 6 deletions

View File

@@ -6,6 +6,7 @@ edition = "2018"
build = "build.rs" build = "build.rs"
[dependencies] [dependencies]
bstr = "0.2"
fst = "0.4" fst = "0.4"
isolang = "1.0" isolang = "1.0"
once_cell = "1.4" once_cell = "1.4"

View File

@@ -1,5 +1,6 @@
use std::str; use std::str;
use bstr::ByteSlice;
use fst::raw::{Fst, Output}; use fst::raw::{Fst, Output};
pub use isolang::Language; pub use isolang::Language;
use once_cell::sync::Lazy; use once_cell::sync::Lazy;
@@ -15,9 +16,14 @@ pub struct Match {
pub end: usize, pub end: usize,
} }
pub fn scan(input: impl AsRef<str>) -> Vec<Match> { pub fn scan(input: impl AsRef<[u8]>) -> Vec<Match> {
let input = input.as_ref().to_lowercase(); let input = input.as_ref().to_lowercase();
let (start_indices, end_indices): (Vec<_>, Vec<_>) = input
.word_indices()
.map(|(start, end, _)| (start, end))
.unzip();
let mut result = Vec::new(); let mut result = Vec::new();
let mut stack = vec![None; N]; let mut stack = vec![None; N];
@@ -44,12 +50,14 @@ pub fn scan(input: impl AsRef<str>) -> Vec<Match> {
let bytes = value.to_be_bytes(); let bytes = value.to_be_bytes();
let code = str::from_utf8(&bytes[5..]).unwrap(); let code = str::from_utf8(&bytes[5..]).unwrap();
if start_indices.contains(&idx) && end_indices.contains(&(idx + len)) {
result.push(Match { result.push(Match {
language: Language::from_639_3(code).unwrap(), language: Language::from_639_3(code).unwrap(),
start: idx, start: idx,
end: idx + len, end: idx + len,
}); });
} }
}
*entry = Some((node, out)); *entry = Some((node, out));
} }
@@ -77,4 +85,9 @@ mod tests {
}] }]
); );
} }
// Regression test: scanning the single word "ampersand" must yield no matches.
// NOTE(review): presumably guards against a language name being matched inside
// a longer word, now rejected by the word-boundary check in `scan` — confirm
// which substring triggered the original bug.
#[test]
fn test_bug_001() {
assert_eq!(scan("ampersand"), vec![]);
}
} }