Rename ByteNgram to Ngram and ByteNgramReader to Ngrams.

This commit is contained in:
2018-01-12 13:52:31 +01:00
parent 0c8a017152
commit 30852d02ee
2 changed files with 63 additions and 60 deletions

View File

@@ -12,8 +12,12 @@ fn criterion_benchmark(c: &mut Criterion) {
let len = from_slice(&data[..]).len(); let len = from_slice(&data[..]).len();
c.bench_function("from_slice", |b| b.iter(|| assert_eq!(len, from_slice(&data[..]).len()))); c.bench_function("from_slice", |b| {
c.bench_function("ByteNgramReader", |b| b.iter(|| assert_eq!(len, ByteNgramReader::new(&data).count()))); b.iter(|| assert_eq!(len, from_slice(&data[..]).len()))
});
c.bench_function("ByteNgramReader", |b| {
b.iter(|| assert_eq!(len, Ngrams::from(&data).count()))
});
} }
criterion_group!(benches, criterion_benchmark); criterion_group!(benches, criterion_benchmark);

View File

@@ -8,10 +8,10 @@ use std::convert;
use std::slice::Iter; use std::slice::Iter;
#[derive(Hash, PartialEq, Eq, Clone, Copy, PartialOrd, Ord, Serialize, Deserialize)] #[derive(Hash, PartialEq, Eq, Clone, Copy, PartialOrd, Ord, Serialize, Deserialize)]
pub struct ByteNgram(u64); pub struct Ngram(u64);
impl ByteNgram { impl Ngram {
pub fn from_slice<T>(data: T) -> ByteNgram pub fn from_slice<T>(data: T) -> Ngram
where where
T: AsRef<[u8]>, T: AsRef<[u8]>,
{ {
@@ -26,7 +26,7 @@ impl ByteNgram {
token += data.as_ref().len() as u64; token += data.as_ref().len() as u64;
ByteNgram(token) Ngram(token)
} }
pub fn dim(&self) -> usize { pub fn dim(&self) -> usize {
@@ -45,8 +45,8 @@ impl ByteNgram {
bytes bytes
} }
pub fn lhs(&self) -> ByteNgram { pub fn lhs(&self) -> Ngram {
ByteNgram(match self.dim() { Ngram(match self.dim() {
2 => (self.0 >> 8 & 0x0000_0000_0000_FF00) + (self.dim() - 1) as u64, 2 => (self.0 >> 8 & 0x0000_0000_0000_FF00) + (self.dim() - 1) as u64,
3 => (self.0 >> 8 & 0x0000_0000_00FF_FF00) + (self.dim() - 1) as u64, 3 => (self.0 >> 8 & 0x0000_0000_00FF_FF00) + (self.dim() - 1) as u64,
4 => (self.0 >> 8 & 0x0000_0000_FFFF_FF00) + (self.dim() - 1) as u64, 4 => (self.0 >> 8 & 0x0000_0000_FFFF_FF00) + (self.dim() - 1) as u64,
@@ -57,8 +57,8 @@ impl ByteNgram {
}) })
} }
pub fn rhs(&self) -> ByteNgram { pub fn rhs(&self) -> Ngram {
ByteNgram(match self.dim() { Ngram(match self.dim() {
2 => (self.0 & 0x0000_0000_0000_FF00) + (self.dim() - 1) as u64, 2 => (self.0 & 0x0000_0000_0000_FF00) + (self.dim() - 1) as u64,
3 => (self.0 & 0x0000_0000_00FF_FF00) + (self.dim() - 1) as u64, 3 => (self.0 & 0x0000_0000_00FF_FF00) + (self.dim() - 1) as u64,
4 => (self.0 & 0x0000_0000_FFFF_FF00) + (self.dim() - 1) as u64, 4 => (self.0 & 0x0000_0000_FFFF_FF00) + (self.dim() - 1) as u64,
@@ -70,13 +70,13 @@ impl ByteNgram {
} }
} }
impl fmt::Debug for ByteNgram { impl fmt::Debug for Ngram {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{:?}", self.bytes()) write!(f, "{:?}", self.bytes())
} }
} }
impl ops::Deref for ByteNgram { impl ops::Deref for Ngram {
type Target = u64; type Target = u64;
fn deref(&self) -> &Self::Target { fn deref(&self) -> &Self::Target {
@@ -84,24 +84,26 @@ impl ops::Deref for ByteNgram {
} }
} }
impl convert::From<u64> for ByteNgram { impl convert::From<u64> for Ngram {
fn from(value: u64) -> Self { fn from(value: u64) -> Self {
ByteNgram(value) Ngram(value)
} }
} }
pub struct ByteNgramReader<'a> { pub struct Ngrams<'a> {
inner: Iter<'a, u8>, inner: Iter<'a, u8>,
count: u64, count: u64,
token: u64, token: u64,
mask: u64, mask: u64,
} }
impl<'a> ByteNgramReader<'a> { impl<'a, T> convert::From<&'a T> for Ngrams<'a>
#[inline] where
pub fn new<T: 'a>(inner: &'a T) -> Self where T: AsRef<[u8]> { T: AsRef<[u8]>,
ByteNgramReader { {
inner: inner.as_ref().iter(), fn from(src: &'a T) -> Ngrams<'a> {
Ngrams {
inner: src.as_ref().iter(),
count: 0, count: 0,
token: 0, token: 0,
mask: 0, mask: 0,
@@ -109,8 +111,8 @@ impl<'a> ByteNgramReader<'a> {
} }
} }
impl<'a> Iterator for ByteNgramReader<'a> { impl<'a> Iterator for Ngrams<'a> {
type Item = ByteNgram; type Item = Ngram;
#[inline] #[inline]
fn next(&mut self) -> Option<Self::Item> { fn next(&mut self) -> Option<Self::Item> {
@@ -131,7 +133,7 @@ impl<'a> Iterator for ByteNgramReader<'a> {
} }
} }
let token = ByteNgram((self.token & self.mask & !0xFF) + (self.mask & 0xFF)); let token = Ngram((self.token & self.mask & !0xFF) + (self.mask & 0xFF));
self.mask = ((self.mask >> 8) & !0xFF) + (self.mask & 0xFF) - 1; self.mask = ((self.mask >> 8) & !0xFF) + (self.mask & 0xFF) - 1;
@@ -139,7 +141,7 @@ impl<'a> Iterator for ByteNgramReader<'a> {
} }
} }
pub fn from_slice<T>(data: T) -> Vec<ByteNgram> pub fn from_slice<T>(data: T) -> Vec<Ngram>
where where
T: AsRef<[u8]>, T: AsRef<[u8]>,
{ {
@@ -149,31 +151,31 @@ where
for i in 0..buffer.len() { for i in 0..buffer.len() {
{ {
tokens.push(ByteNgram::from_slice(&buffer[i..i + 1])); tokens.push(Ngram::from_slice(&buffer[i..i + 1]));
} }
if i >= 1 { if i >= 1 {
tokens.push(ByteNgram::from_slice(&buffer[i - 1..i + 1])); tokens.push(Ngram::from_slice(&buffer[i - 1..i + 1]));
} }
if i >= 2 { if i >= 2 {
tokens.push(ByteNgram::from_slice(&buffer[i - 2..i + 1])); tokens.push(Ngram::from_slice(&buffer[i - 2..i + 1]));
} }
if i >= 3 { if i >= 3 {
tokens.push(ByteNgram::from_slice(&buffer[i - 3..i + 1])); tokens.push(Ngram::from_slice(&buffer[i - 3..i + 1]));
} }
if i >= 4 { if i >= 4 {
tokens.push(ByteNgram::from_slice(&buffer[i - 4..i + 1])); tokens.push(Ngram::from_slice(&buffer[i - 4..i + 1]));
} }
if i >= 5 { if i >= 5 {
tokens.push(ByteNgram::from_slice(&buffer[i - 5..i + 1])); tokens.push(Ngram::from_slice(&buffer[i - 5..i + 1]));
} }
if i >= 6 { if i >= 6 {
tokens.push(ByteNgram::from_slice(&buffer[i - 6..i + 1])); tokens.push(Ngram::from_slice(&buffer[i - 6..i + 1]));
} }
} }
@@ -187,65 +189,62 @@ mod tests {
#[test] #[test]
fn test_lhs() { fn test_lhs() {
assert_eq!( assert_eq!(
ByteNgram::from_slice("ab").lhs(), Ngram::from_slice("ab").lhs(),
ByteNgram::from_slice("a") Ngram::from_slice("a")
); );
assert_eq!( assert_eq!(
ByteNgram::from_slice("abc").lhs(), Ngram::from_slice("abc").lhs(),
ByteNgram::from_slice("ab") Ngram::from_slice("ab")
); );
assert_eq!( assert_eq!(
ByteNgram::from_slice("abcd").lhs(), Ngram::from_slice("abcd").lhs(),
ByteNgram::from_slice("abc") Ngram::from_slice("abc")
); );
assert_eq!( assert_eq!(
ByteNgram::from_slice("abcde").lhs(), Ngram::from_slice("abcde").lhs(),
ByteNgram::from_slice("abcd") Ngram::from_slice("abcd")
); );
assert_eq!( assert_eq!(
ByteNgram::from_slice("abcdef").lhs(), Ngram::from_slice("abcdef").lhs(),
ByteNgram::from_slice("abcde") Ngram::from_slice("abcde")
); );
assert_eq!( assert_eq!(
ByteNgram::from_slice("abcdefg").lhs(), Ngram::from_slice("abcdefg").lhs(),
ByteNgram::from_slice("abcdef") Ngram::from_slice("abcdef")
); );
} }
#[test] #[test]
fn test_rhs() { fn test_rhs() {
assert_eq!( assert_eq!(
ByteNgram::from_slice("ab").rhs(), Ngram::from_slice("ab").rhs(),
ByteNgram::from_slice("b") Ngram::from_slice("b")
); );
assert_eq!( assert_eq!(
ByteNgram::from_slice("abc").rhs(), Ngram::from_slice("abc").rhs(),
ByteNgram::from_slice("bc") Ngram::from_slice("bc")
); );
assert_eq!( assert_eq!(
ByteNgram::from_slice("abcd").rhs(), Ngram::from_slice("abcd").rhs(),
ByteNgram::from_slice("bcd") Ngram::from_slice("bcd")
); );
assert_eq!( assert_eq!(
ByteNgram::from_slice("abcde").rhs(), Ngram::from_slice("abcde").rhs(),
ByteNgram::from_slice("bcde") Ngram::from_slice("bcde")
); );
assert_eq!( assert_eq!(
ByteNgram::from_slice("abcdef").rhs(), Ngram::from_slice("abcdef").rhs(),
ByteNgram::from_slice("bcdef") Ngram::from_slice("bcdef")
); );
assert_eq!( assert_eq!(
ByteNgram::from_slice("abcdefg").rhs(), Ngram::from_slice("abcdefg").rhs(),
ByteNgram::from_slice("bcdefg") Ngram::from_slice("bcdefg")
); );
} }
#[test] #[test]
fn test_bytes() { fn test_bytes() {
assert_eq!( assert_eq!(Ngram::from_slice("ab").bytes(), vec![97, 98]);
ByteNgram::from_slice("ab").bytes(),
vec![97, 98]
);
} }
#[test] #[test]
@@ -253,7 +252,7 @@ mod tests {
let data = b"abc"; let data = b"abc";
let mut a = from_slice(&data); let mut a = from_slice(&data);
let mut b = ByteNgramReader::new(&data).collect::<Vec<_>>(); let mut b = Ngrams::from(&data).collect::<Vec<_>>();
assert_eq!(a.len(), b.len()); assert_eq!(a.len(), b.len());