diff --git a/benches/bench.rs b/benches/bench.rs index d4883d8..ec038e5 100644 --- a/benches/bench.rs +++ b/benches/bench.rs @@ -12,8 +12,12 @@ fn criterion_benchmark(c: &mut Criterion) { let len = from_slice(&data[..]).len(); - c.bench_function("from_slice", |b| b.iter(|| assert_eq!(len, from_slice(&data[..]).len()))); - c.bench_function("ByteNgramReader", |b| b.iter(|| assert_eq!(len, ByteNgramReader::new(&data).count()))); + c.bench_function("from_slice", |b| { + b.iter(|| assert_eq!(len, from_slice(&data[..]).len())) + }); + c.bench_function("ByteNgramReader", |b| { + b.iter(|| assert_eq!(len, Ngrams::from(&data).count())) + }); } criterion_group!(benches, criterion_benchmark); diff --git a/src/lib.rs b/src/lib.rs index 2ef8b82..01a32bb 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -8,10 +8,10 @@ use std::convert; use std::slice::Iter; #[derive(Hash, PartialEq, Eq, Clone, Copy, PartialOrd, Ord, Serialize, Deserialize)] -pub struct ByteNgram(u64); +pub struct Ngram(u64); -impl ByteNgram { - pub fn from_slice(data: T) -> ByteNgram +impl Ngram { + pub fn from_slice(data: T) -> Ngram where T: AsRef<[u8]>, { @@ -26,7 +26,7 @@ impl ByteNgram { token += data.as_ref().len() as u64; - ByteNgram(token) + Ngram(token) } pub fn dim(&self) -> usize { @@ -45,8 +45,8 @@ impl ByteNgram { bytes } - pub fn lhs(&self) -> ByteNgram { - ByteNgram(match self.dim() { + pub fn lhs(&self) -> Ngram { + Ngram(match self.dim() { 2 => (self.0 >> 8 & 0x0000_0000_0000_FF00) + (self.dim() - 1) as u64, 3 => (self.0 >> 8 & 0x0000_0000_00FF_FF00) + (self.dim() - 1) as u64, 4 => (self.0 >> 8 & 0x0000_0000_FFFF_FF00) + (self.dim() - 1) as u64, @@ -57,8 +57,8 @@ impl ByteNgram { }) } - pub fn rhs(&self) -> ByteNgram { - ByteNgram(match self.dim() { + pub fn rhs(&self) -> Ngram { + Ngram(match self.dim() { 2 => (self.0 & 0x0000_0000_0000_FF00) + (self.dim() - 1) as u64, 3 => (self.0 & 0x0000_0000_00FF_FF00) + (self.dim() - 1) as u64, 4 => (self.0 & 0x0000_0000_FFFF_FF00) + (self.dim() - 1) as u64, @@ -70,13 +70,13 @@ impl ByteNgram { } } -impl fmt::Debug for ByteNgram { +impl fmt::Debug for Ngram { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "{:?}", self.bytes()) } } -impl ops::Deref for ByteNgram { +impl ops::Deref for Ngram { type Target = u64; fn deref(&self) -> &Self::Target { @@ -84,24 +84,26 @@ impl ops::Deref for ByteNgram { } } -impl convert::From for ByteNgram { +impl convert::From for Ngram { fn from(value: u64) -> Self { - ByteNgram(value) + Ngram(value) } } -pub struct ByteNgramReader<'a> { +pub struct Ngrams<'a> { inner: Iter<'a, u8>, count: u64, token: u64, mask: u64, } -impl<'a> ByteNgramReader<'a> { - #[inline] - pub fn new(inner: &'a T) -> Self where T: AsRef<[u8]> { - ByteNgramReader { - inner: inner.as_ref().iter(), +impl<'a, T> convert::From<&'a T> for Ngrams<'a> +where + T: AsRef<[u8]>, +{ + fn from(src: &'a T) -> Ngrams<'a> { + Ngrams { + inner: src.as_ref().iter(), count: 0, token: 0, mask: 0, @@ -109,8 +111,8 @@ impl<'a> ByteNgramReader<'a> { } } -impl<'a> Iterator for ByteNgramReader<'a> { - type Item = ByteNgram; +impl<'a> Iterator for Ngrams<'a> { + type Item = Ngram; #[inline] fn next(&mut self) -> Option { @@ -131,7 +133,7 @@ impl<'a> Iterator for ByteNgramReader<'a> { } } - let token = ByteNgram((self.token & self.mask & !0xFF) + (self.mask & 0xFF)); + let token = Ngram((self.token & self.mask & !0xFF) + (self.mask & 0xFF)); self.mask = ((self.mask >> 8) & !0xFF) + (self.mask & 0xFF) - 1; @@ -139,7 +141,7 @@ impl<'a> Iterator for ByteNgramReader<'a> { } } -pub fn from_slice(data: T) -> Vec +pub fn from_slice(data: T) -> Vec where T: AsRef<[u8]>, { @@ -149,31 +151,31 @@ where for i in 0..buffer.len() { { - tokens.push(ByteNgram::from_slice(&buffer[i..i + 1])); + tokens.push(Ngram::from_slice(&buffer[i..i + 1])); } if i >= 1 { - tokens.push(ByteNgram::from_slice(&buffer[i - 1..i + 1])); + tokens.push(Ngram::from_slice(&buffer[i - 1..i + 1])); } if i >= 2 { - tokens.push(ByteNgram::from_slice(&buffer[i - 2..i + 1])); + tokens.push(Ngram::from_slice(&buffer[i - 2..i + 1])); } if i >= 3 { - tokens.push(ByteNgram::from_slice(&buffer[i - 3..i + 1])); + tokens.push(Ngram::from_slice(&buffer[i - 3..i + 1])); } if i >= 4 { - tokens.push(ByteNgram::from_slice(&buffer[i - 4..i + 1])); + tokens.push(Ngram::from_slice(&buffer[i - 4..i + 1])); } if i >= 5 { - tokens.push(ByteNgram::from_slice(&buffer[i - 5..i + 1])); + tokens.push(Ngram::from_slice(&buffer[i - 5..i + 1])); } if i >= 6 { - tokens.push(ByteNgram::from_slice(&buffer[i - 6..i + 1])); + tokens.push(Ngram::from_slice(&buffer[i - 6..i + 1])); } } @@ -187,65 +189,62 @@ mod tests { #[test] fn test_lhs() { assert_eq!( - ByteNgram::from_slice("ab").lhs(), - ByteNgram::from_slice("a") + Ngram::from_slice("ab").lhs(), + Ngram::from_slice("a") ); assert_eq!( - ByteNgram::from_slice("abc").lhs(), - ByteNgram::from_slice("ab") + Ngram::from_slice("abc").lhs(), + Ngram::from_slice("ab") ); assert_eq!( - ByteNgram::from_slice("abcd").lhs(), - ByteNgram::from_slice("abc") + Ngram::from_slice("abcd").lhs(), + Ngram::from_slice("abc") ); assert_eq!( - ByteNgram::from_slice("abcde").lhs(), - ByteNgram::from_slice("abcd") + Ngram::from_slice("abcde").lhs(), + Ngram::from_slice("abcd") ); assert_eq!( - ByteNgram::from_slice("abcdef").lhs(), - ByteNgram::from_slice("abcde") + Ngram::from_slice("abcdef").lhs(), + Ngram::from_slice("abcde") ); assert_eq!( - ByteNgram::from_slice("abcdefg").lhs(), - ByteNgram::from_slice("abcdef") + Ngram::from_slice("abcdefg").lhs(), + Ngram::from_slice("abcdef") ); } #[test] fn test_rhs() { assert_eq!( - ByteNgram::from_slice("ab").rhs(), - ByteNgram::from_slice("b") + Ngram::from_slice("ab").rhs(), + Ngram::from_slice("b") ); assert_eq!( - ByteNgram::from_slice("abc").rhs(), - ByteNgram::from_slice("bc") + Ngram::from_slice("abc").rhs(), + Ngram::from_slice("bc") ); assert_eq!( - ByteNgram::from_slice("abcd").rhs(), - ByteNgram::from_slice("bcd") + Ngram::from_slice("abcd").rhs(), + Ngram::from_slice("bcd") ); assert_eq!( - ByteNgram::from_slice("abcde").rhs(), - ByteNgram::from_slice("bcde") + Ngram::from_slice("abcde").rhs(), + Ngram::from_slice("bcde") ); assert_eq!( - ByteNgram::from_slice("abcdef").rhs(), - ByteNgram::from_slice("bcdef") + Ngram::from_slice("abcdef").rhs(), + Ngram::from_slice("bcdef") ); assert_eq!( - ByteNgram::from_slice("abcdefg").rhs(), - ByteNgram::from_slice("bcdefg") + Ngram::from_slice("abcdefg").rhs(), + Ngram::from_slice("bcdefg") ); } #[test] fn test_bytes() { - assert_eq!( - ByteNgram::from_slice("ab").bytes(), - vec![97, 98] - ); + assert_eq!(Ngram::from_slice("ab").bytes(), vec![97, 98]); } #[test] @@ -253,7 +252,7 @@ mod tests { let data = b"abc"; let mut a = from_slice(&data); - let mut b = ByteNgramReader::new(&data).collect::>(); + let mut b = Ngrams::from(&data).collect::>(); assert_eq!(a.len(), b.len());