Check for word boundaries.
This commit is contained in:
@@ -6,6 +6,7 @@ edition = "2018"
|
||||
build = "build.rs"
|
||||
|
||||
[dependencies]
|
||||
bstr = "0.2"
|
||||
fst = "0.4"
|
||||
isolang = "1.0"
|
||||
once_cell = "1.4"
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use std::str;
|
||||
|
||||
use bstr::ByteSlice;
|
||||
use fst::raw::{Fst, Output};
|
||||
pub use isolang::Language;
|
||||
use once_cell::sync::Lazy;
|
||||
@@ -15,9 +16,14 @@ pub struct Match {
|
||||
pub end: usize,
|
||||
}
|
||||
|
||||
pub fn scan(input: impl AsRef<str>) -> Vec<Match> {
|
||||
pub fn scan(input: impl AsRef<[u8]>) -> Vec<Match> {
|
||||
let input = input.as_ref().to_lowercase();
|
||||
|
||||
let (start_indices, end_indices): (Vec<_>, Vec<_>) = input
|
||||
.word_indices()
|
||||
.map(|(start, end, _)| (start, end))
|
||||
.unzip();
|
||||
|
||||
let mut result = Vec::new();
|
||||
|
||||
let mut stack = vec![None; N];
|
||||
@@ -44,11 +50,13 @@ pub fn scan(input: impl AsRef<str>) -> Vec<Match> {
|
||||
let bytes = value.to_be_bytes();
|
||||
let code = str::from_utf8(&bytes[5..]).unwrap();
|
||||
|
||||
result.push(Match {
|
||||
language: Language::from_639_3(code).unwrap(),
|
||||
start: idx,
|
||||
end: idx + len,
|
||||
});
|
||||
if start_indices.contains(&idx) && end_indices.contains(&(idx + len)) {
|
||||
result.push(Match {
|
||||
language: Language::from_639_3(code).unwrap(),
|
||||
start: idx,
|
||||
end: idx + len,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
*entry = Some((node, out));
|
||||
@@ -77,4 +85,9 @@ mod tests {
|
||||
}]
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bug_001() {
|
||||
assert_eq!(scan("ampersand"), vec![]);
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user