ByteNgramReader is almost as fast as from_slice.

This commit is contained in:
2018-01-09 18:21:08 +01:00
parent 92fa9401cf
commit 8fd169ed3f
2 changed files with 80 additions and 37 deletions

View File

@@ -3,19 +3,17 @@ extern crate criterion;
extern crate byte_ngram; extern crate byte_ngram;
use criterion::{Criterion, Fun}; use criterion::Criterion;
use byte_ngram::{ByteNgramReader, from_slice}; use byte_ngram::*;
fn criterion_benchmark(c: &mut Criterion) { fn criterion_benchmark(c: &mut Criterion) {
let data = String::from("Blackmail is such an ugly word. I prefer extortion. The 'x' makes it sound cool."); let data = String::from(
"Blackmail is such an ugly word. I prefer extortion. The 'x' makes it sound cool.",
);
let fn_from_slice = Fun::new("from_slice", |b, i: &String| b.iter(|| for _ in from_slice(i.as_bytes()) {})); c.bench_function("from_slice", |b| b.iter(|| for _ in from_slice(data.as_bytes()) {}));
let fn_from_reader = Fun::new("ByteNgramReader", |b, i: &String| b.iter(|| for _ in ByteNgramReader::new(i.as_bytes()) {})); c.bench_function("ByteNgramReader", |b| b.iter(|| for _ in ByteNgramReader::new(data.as_bytes()) {}));
let functions = vec![fn_from_slice, fn_from_reader];
c.bench_functions("Read", functions, &data);
} }
criterion_group!(benches, criterion_benchmark); criterion_group!(benches, criterion_benchmark);

View File

@@ -93,8 +93,9 @@ impl convert::From<u64> for ByteNgram {
pub struct ByteNgramReader<R: Read> { pub struct ByteNgramReader<R: Read> {
inner: Bytes<R>, inner: Bytes<R>,
token: u64, token: u64,
index: u8, count: u64,
count: u8, index: u64,
mask: u64,
} }
impl<R: Read> ByteNgramReader<R> { impl<R: Read> ByteNgramReader<R> {
@@ -102,8 +103,9 @@ impl<R: Read> ByteNgramReader<R> {
ByteNgramReader { ByteNgramReader {
inner: inner.bytes(), inner: inner.bytes(),
token: 0, token: 0,
index: 0,
count: 0, count: 0,
index: 0,
mask: 0,
} }
} }
} }
@@ -119,26 +121,22 @@ impl<R: Read> Iterator for ByteNgramReader<R> {
if self.count < 7 { if self.count < 7 {
self.count += 1; self.count += 1;
self.index = self.count;
self.mask = (256u64.pow(self.count as u32) - 1) << 8;
} else {
self.index = 7;
self.mask = 0xFFFF_FFFF_FFFF_FFFF;
} }
} else { } else {
return None; return None;
} }
} }
self.index += 1; let token = ByteNgram((self.token & self.mask & !0xFF) + self.index);
let token = ByteNgram(match self.index { self.index -= 1;
1 => (self.token & 0x0000_0000_0000_FF00) + 1, self.mask >>= 8;
2 => (self.token & 0x0000_0000_00FF_FF00) + 2,
3 => (self.token & 0x0000_0000_FFFF_FF00) + 3,
4 => (self.token & 0x0000_00FF_FFFF_FF00) + 4,
5 => (self.token & 0x0000_FFFF_FFFF_FF00) + 5,
6 => (self.token & 0x00FF_FFFF_FFFF_FF00) + 6,
7 => (self.token & 0xFFFF_FFFF_FFFF_FF00) + 7,
_ => unreachable!(),
});
self.index %= self.count;
Some(token) Some(token)
} }
@@ -191,10 +189,22 @@ mod tests {
#[test] #[test]
fn test_lhs() { fn test_lhs() {
assert_eq!(ByteNgram::from_slice("ab").lhs(), ByteNgram::from_slice("a")); assert_eq!(
assert_eq!(ByteNgram::from_slice("abc").lhs(), ByteNgram::from_slice("ab")); ByteNgram::from_slice("ab").lhs(),
assert_eq!(ByteNgram::from_slice("abcd").lhs(), ByteNgram::from_slice("abc")); ByteNgram::from_slice("a")
assert_eq!(ByteNgram::from_slice("abcde").lhs(), ByteNgram::from_slice("abcd")); );
assert_eq!(
ByteNgram::from_slice("abc").lhs(),
ByteNgram::from_slice("ab")
);
assert_eq!(
ByteNgram::from_slice("abcd").lhs(),
ByteNgram::from_slice("abc")
);
assert_eq!(
ByteNgram::from_slice("abcde").lhs(),
ByteNgram::from_slice("abcd")
);
assert_eq!( assert_eq!(
ByteNgram::from_slice("abcdef").lhs(), ByteNgram::from_slice("abcdef").lhs(),
ByteNgram::from_slice("abcde") ByteNgram::from_slice("abcde")
@@ -207,10 +217,22 @@ mod tests {
#[test] #[test]
fn test_rhs() { fn test_rhs() {
assert_eq!(ByteNgram::from_slice("ab").rhs(), ByteNgram::from_slice("b")); assert_eq!(
assert_eq!(ByteNgram::from_slice("abc").rhs(), ByteNgram::from_slice("bc")); ByteNgram::from_slice("ab").rhs(),
assert_eq!(ByteNgram::from_slice("abcd").rhs(), ByteNgram::from_slice("bcd")); ByteNgram::from_slice("b")
assert_eq!(ByteNgram::from_slice("abcde").rhs(), ByteNgram::from_slice("bcde")); );
assert_eq!(
ByteNgram::from_slice("abc").rhs(),
ByteNgram::from_slice("bc")
);
assert_eq!(
ByteNgram::from_slice("abcd").rhs(),
ByteNgram::from_slice("bcd")
);
assert_eq!(
ByteNgram::from_slice("abcde").rhs(),
ByteNgram::from_slice("bcde")
);
assert_eq!( assert_eq!(
ByteNgram::from_slice("abcdef").rhs(), ByteNgram::from_slice("abcdef").rhs(),
ByteNgram::from_slice("bcdef") ByteNgram::from_slice("bcdef")
@@ -222,11 +244,34 @@ mod tests {
} }
#[test] #[test]
fn test_reader() { fn test_bytes() {
let data = b"abcdef"; assert_eq!(
ByteNgram::from_slice("ab").bytes(),
vec![97, 98]
);
}
let a = from_slice(&data); #[test]
let b = ByteNgramReader::new(&data[..]).collect::<Vec<_>>(); fn test_reader() {
let data = b"abc";
let mut a = from_slice(&data);
let mut b = ByteNgramReader::new(&data[..]).collect::<Vec<_>>();
assert_eq!(a.len(), b.len());
a.sort();
b.sort();
println!("a");
for token in &a {
println!("{:64b}", token.0);
}
println!("b");
for token in &b {
println!("{:64b}", token.0);
}
assert_eq!(a, b); assert_eq!(a, b);
} }