Check for word boundaries.

This commit is contained in:
2020-09-25 07:35:22 +02:00
parent fdbdccc47c
commit 354350f734
2 changed files with 20 additions and 6 deletions

View File

@@ -6,6 +6,7 @@ edition = "2018"
build = "build.rs"
[dependencies]
bstr = "0.2"
fst = "0.4"
isolang = "1.0"
once_cell = "1.4"

View File

@@ -1,5 +1,6 @@
use std::str;
use bstr::ByteSlice;
use fst::raw::{Fst, Output};
pub use isolang::Language;
use once_cell::sync::Lazy;
@@ -15,9 +16,14 @@ pub struct Match {
pub end: usize,
}
pub fn scan(input: impl AsRef<str>) -> Vec<Match> {
pub fn scan(input: impl AsRef<[u8]>) -> Vec<Match> {
let input = input.as_ref().to_lowercase();
let (start_indices, end_indices): (Vec<_>, Vec<_>) = input
.word_indices()
.map(|(start, end, _)| (start, end))
.unzip();
let mut result = Vec::new();
let mut stack = vec![None; N];
@@ -44,11 +50,13 @@ pub fn scan(input: impl AsRef<str>) -> Vec<Match> {
let bytes = value.to_be_bytes();
let code = str::from_utf8(&bytes[5..]).unwrap();
result.push(Match {
language: Language::from_639_3(code).unwrap(),
start: idx,
end: idx + len,
});
if start_indices.contains(&idx) && end_indices.contains(&(idx + len)) {
result.push(Match {
language: Language::from_639_3(code).unwrap(),
start: idx,
end: idx + len,
});
}
}
*entry = Some((node, out));
@@ -77,4 +85,9 @@ mod tests {
}]
);
}
#[test]
fn test_bug_001() {
    // Regression check: scanning the single word "ampersand" must not
    // report any match.
    let found = scan("ampersand");
    assert_eq!(found, vec![]);
}
}