Check for word boundaries.
This commit is contained in:
@@ -6,6 +6,7 @@ edition = "2018"
|
|||||||
build = "build.rs"
|
build = "build.rs"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
|
bstr = "0.2"
|
||||||
fst = "0.4"
|
fst = "0.4"
|
||||||
isolang = "1.0"
|
isolang = "1.0"
|
||||||
once_cell = "1.4"
|
once_cell = "1.4"
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
use std::str;
|
use std::str;
|
||||||
|
|
||||||
|
use bstr::ByteSlice;
|
||||||
use fst::raw::{Fst, Output};
|
use fst::raw::{Fst, Output};
|
||||||
pub use isolang::Language;
|
pub use isolang::Language;
|
||||||
use once_cell::sync::Lazy;
|
use once_cell::sync::Lazy;
|
||||||
@@ -15,9 +16,14 @@ pub struct Match {
|
|||||||
pub end: usize,
|
pub end: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn scan(input: impl AsRef<str>) -> Vec<Match> {
|
pub fn scan(input: impl AsRef<[u8]>) -> Vec<Match> {
|
||||||
let input = input.as_ref().to_lowercase();
|
let input = input.as_ref().to_lowercase();
|
||||||
|
|
||||||
|
let (start_indices, end_indices): (Vec<_>, Vec<_>) = input
|
||||||
|
.word_indices()
|
||||||
|
.map(|(start, end, _)| (start, end))
|
||||||
|
.unzip();
|
||||||
|
|
||||||
let mut result = Vec::new();
|
let mut result = Vec::new();
|
||||||
|
|
||||||
let mut stack = vec![None; N];
|
let mut stack = vec![None; N];
|
||||||
@@ -44,11 +50,13 @@ pub fn scan(input: impl AsRef<str>) -> Vec<Match> {
|
|||||||
let bytes = value.to_be_bytes();
|
let bytes = value.to_be_bytes();
|
||||||
let code = str::from_utf8(&bytes[5..]).unwrap();
|
let code = str::from_utf8(&bytes[5..]).unwrap();
|
||||||
|
|
||||||
result.push(Match {
|
if start_indices.contains(&idx) && end_indices.contains(&(idx + len)) {
|
||||||
language: Language::from_639_3(code).unwrap(),
|
result.push(Match {
|
||||||
start: idx,
|
language: Language::from_639_3(code).unwrap(),
|
||||||
end: idx + len,
|
start: idx,
|
||||||
});
|
end: idx + len,
|
||||||
|
});
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
*entry = Some((node, out));
|
*entry = Some((node, out));
|
||||||
@@ -77,4 +85,9 @@ mod tests {
|
|||||||
}]
|
}]
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_bug_001() {
|
||||||
|
assert_eq!(scan("ampersand"), vec![]);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user