From 8fd169ed3f07acf82847ee1105738ab171ac5808 Mon Sep 17 00:00:00 2001 From: logaritmisk Date: Tue, 9 Jan 2018 18:21:08 +0100 Subject: [PATCH] ByteNgramReader is almost as fast as from_slice. --- benches/bench.rs | 16 ++++---- src/lib.rs | 101 ++++++++++++++++++++++++++++++++++------------- 2 files changed, 80 insertions(+), 37 deletions(-) diff --git a/benches/bench.rs b/benches/bench.rs index 9057efa..8347c78 100644 --- a/benches/bench.rs +++ b/benches/bench.rs @@ -3,19 +3,17 @@ extern crate criterion; extern crate byte_ngram; -use criterion::{Criterion, Fun}; +use criterion::Criterion; -use byte_ngram::{ByteNgramReader, from_slice}; +use byte_ngram::*; fn criterion_benchmark(c: &mut Criterion) { - let data = String::from("Blackmail is such an ugly word. I prefer extortion. The 'x' makes it sound cool."); + let data = String::from( + "Blackmail is such an ugly word. I prefer extortion. The 'x' makes it sound cool.", + ); - let fn_from_slice = Fun::new("from_slice", |b, i: &String| b.iter(|| for _ in from_slice(i.as_bytes()) {})); - let fn_from_reader = Fun::new("ByteNgramReader", |b, i: &String| b.iter(|| for _ in ByteNgramReader::new(i.as_bytes()) {})); - - let functions = vec![fn_from_slice, fn_from_reader]; - - c.bench_functions("Read", functions, &data); + c.bench_function("from_slice", |b| b.iter(|| for _ in from_slice(data.as_bytes()) {})); + c.bench_function("ByteNgramReader", |b| b.iter(|| for _ in ByteNgramReader::new(data.as_bytes()) {})); } criterion_group!(benches, criterion_benchmark); diff --git a/src/lib.rs b/src/lib.rs index 1aa8fb4..06c06f7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -93,8 +93,9 @@ impl convert::From for ByteNgram { pub struct ByteNgramReader { inner: Bytes, token: u64, - index: u8, - count: u8, + count: u64, + index: u64, + mask: u64, } impl ByteNgramReader { @@ -102,8 +103,9 @@ impl ByteNgramReader { ByteNgramReader { inner: inner.bytes(), token: 0, - index: 0, count: 0, + index: 0, + mask: 0, } } } @@ -119,26 +121,22 @@ impl Iterator for ByteNgramReader { if self.count < 7 { self.count += 1; + + self.index = self.count; + self.mask = (256u64.pow(self.count as u32) - 1) << 8; + } else { + self.index = 7; + self.mask = 0xFFFF_FFFF_FFFF_FFFF; } } else { return None; } } - self.index += 1; + let token = ByteNgram((self.token & self.mask & !0xFF) + self.index); - let token = ByteNgram(match self.index { - 1 => (self.token & 0x0000_0000_0000_FF00) + 1, - 2 => (self.token & 0x0000_0000_00FF_FF00) + 2, - 3 => (self.token & 0x0000_0000_FFFF_FF00) + 3, - 4 => (self.token & 0x0000_00FF_FFFF_FF00) + 4, - 5 => (self.token & 0x0000_FFFF_FFFF_FF00) + 5, - 6 => (self.token & 0x00FF_FFFF_FFFF_FF00) + 6, - 7 => (self.token & 0xFFFF_FFFF_FFFF_FF00) + 7, - _ => unreachable!(), - }); - - self.index %= self.count; + self.index -= 1; + self.mask >>= 8; Some(token) } @@ -191,10 +189,22 @@ mod tests { #[test] fn test_lhs() { - assert_eq!(ByteNgram::from_slice("ab").lhs(), ByteNgram::from_slice("a")); - assert_eq!(ByteNgram::from_slice("abc").lhs(), ByteNgram::from_slice("ab")); - assert_eq!(ByteNgram::from_slice("abcd").lhs(), ByteNgram::from_slice("abc")); - assert_eq!(ByteNgram::from_slice("abcde").lhs(), ByteNgram::from_slice("abcd")); + assert_eq!( + ByteNgram::from_slice("ab").lhs(), + ByteNgram::from_slice("a") + ); + assert_eq!( + ByteNgram::from_slice("abc").lhs(), + ByteNgram::from_slice("ab") + ); + assert_eq!( + ByteNgram::from_slice("abcd").lhs(), + ByteNgram::from_slice("abc") + ); + assert_eq!( + ByteNgram::from_slice("abcde").lhs(), + ByteNgram::from_slice("abcd") + ); assert_eq!( ByteNgram::from_slice("abcdef").lhs(), ByteNgram::from_slice("abcde") @@ -207,10 +217,22 @@ mod tests { #[test] fn test_rhs() { - assert_eq!(ByteNgram::from_slice("ab").rhs(), ByteNgram::from_slice("b")); - assert_eq!(ByteNgram::from_slice("abc").rhs(), ByteNgram::from_slice("bc")); - assert_eq!(ByteNgram::from_slice("abcd").rhs(), ByteNgram::from_slice("bcd")); - assert_eq!(ByteNgram::from_slice("abcde").rhs(), ByteNgram::from_slice("bcde")); + assert_eq!( + ByteNgram::from_slice("ab").rhs(), + ByteNgram::from_slice("b") + ); + assert_eq!( + ByteNgram::from_slice("abc").rhs(), + ByteNgram::from_slice("bc") + ); + assert_eq!( + ByteNgram::from_slice("abcd").rhs(), + ByteNgram::from_slice("bcd") + ); + assert_eq!( + ByteNgram::from_slice("abcde").rhs(), + ByteNgram::from_slice("bcde") + ); assert_eq!( ByteNgram::from_slice("abcdef").rhs(), ByteNgram::from_slice("bcdef") @@ -222,11 +244,34 @@ mod tests { } #[test] - fn test_reader() { - let data = b"abcdef"; + fn test_bytes() { + assert_eq!( + ByteNgram::from_slice("ab").bytes(), + vec![97, 98] + ); + } - let a = from_slice(&data); - let b = ByteNgramReader::new(&data[..]).collect::>(); + #[test] + fn test_reader() { + let data = b"abc"; + + let mut a = from_slice(&data); + let mut b = ByteNgramReader::new(&data[..]).collect::>(); + + assert_eq!(a.len(), b.len()); + + a.sort(); + b.sort(); + + println!("a"); + for token in &a { + println!("{:64b}", token.0); + } + + println!("b"); + for token in &b { + println!("{:64b}", token.0); + } assert_eq!(a, b); }