commit 92fa9401cff683032dd2354dafad6706c481cb55
Author: logaritmisk
Date:   Tue Jan 9 11:33:39 2018 +0100

    Initial commit.

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..e13de17
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+/target/
+
+Cargo.lock
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
new file mode 100644
index 0000000..e92d30c
--- /dev/null
+++ b/.gitlab-ci.yml
@@ -0,0 +1,14 @@
+image: "rust:latest"
+
+variables:
+  CARGO_HOME: $CI_PROJECT_DIR/cargo
+
+test:cargo:
+  script:
+  - rustc --version && cargo --version
+  - time cargo test --verbose --jobs 1 --release
+
+  cache:
+    paths:
+    - target/
+    - cargo/
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..6eb50e4
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,19 @@
+[package]
+name = "byte-ngram"
+version = "0.1.0"
+authors = ["logaritmisk "]
+
+[dependencies]
+serde = "1.0"
+serde_derive = "1.0"
+
+[dev-dependencies]
+criterion = { git = "https://github.com/japaric/criterion.rs.git" }
+rand = "0.4"
+
+[[bench]]
+name = "bench"
+harness = false
+
+[profile.release]
+lto = true
diff --git a/benches/bench.rs b/benches/bench.rs
new file mode 100644
index 0000000..9057efa
--- /dev/null
+++ b/benches/bench.rs
@@ -0,0 +1,22 @@
+#[macro_use]
+extern crate criterion;
+
+extern crate byte_ngram;
+
+use criterion::{Criterion, Fun};
+
+use byte_ngram::{ByteNgramReader, from_slice};
+
+fn criterion_benchmark(c: &mut Criterion) {
+    let data = String::from("Blackmail is such an ugly word. I prefer extortion. The 'x' makes it sound cool.");
+
+    let fn_from_slice = Fun::new("from_slice", |b, i: &String| b.iter(|| for _ in from_slice(i.as_bytes()) {}));
+    let fn_from_reader = Fun::new("ByteNgramReader", |b, i: &String| b.iter(|| for _ in ByteNgramReader::new(i.as_bytes()) {}));
+
+    let functions = vec![fn_from_slice, fn_from_reader];
+
+    c.bench_functions("Read", functions, &data);
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..1aa8fb4
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,233 @@
+#[macro_use]
+extern crate serde_derive;
+
+use std::io::{Bytes, Read};
+use std::fmt;
+use std::cmp;
+use std::ops;
+use std::convert;
+
+#[derive(Hash, PartialEq, Eq, Clone, Copy, PartialOrd, Ord, Serialize, Deserialize)]
+pub struct ByteNgram(u64);
+
+impl ByteNgram {
+    pub fn from_slice<T>(data: T) -> ByteNgram
+    where
+        T: AsRef<[u8]>,
+    {
+        debug_assert!(data.as_ref().len() <= 7);
+
+        let mut token = 0;
+
+        for byte in data.as_ref() {
+            token += u64::from(*byte);
+            token <<= 8;
+        }
+
+        token += data.as_ref().len() as u64;
+
+        ByteNgram(token)
+    }
+
+    pub fn dim(&self) -> usize {
+        (self.0 & 0xFF) as usize
+    }
+
+    pub fn bytes(&self) -> Vec<u8> {
+        let mut bytes = Vec::with_capacity(self.dim());
+
+        let dim = self.dim() + 1;
+
+        for n in (1..dim).rev() {
+            bytes.push(((self.0 >> (8 * n)) & 0xFF) as u8);
+        }
+
+        bytes
+    }
+
+    pub fn lhs(&self) -> ByteNgram {
+        ByteNgram(match self.dim() {
+            2 => (self.0 >> 8 & 0x0000_0000_0000_FF00) + (self.dim() - 1) as u64,
+            3 => (self.0 >> 8 & 0x0000_0000_00FF_FF00) + (self.dim() - 1) as u64,
+            4 => (self.0 >> 8 & 0x0000_0000_FFFF_FF00) + (self.dim() - 1) as u64,
+            5 => (self.0 >> 8 & 0x0000_00FF_FFFF_FF00) + (self.dim() - 1) as u64,
+            6 => (self.0 >> 8 & 0x0000_FFFF_FFFF_FF00) + (self.dim() - 1) as u64,
+            7 => (self.0 >> 8 & 0x00FF_FFFF_FFFF_FF00) + (self.dim() - 1) as u64,
+            _ => 0,
+        })
+    }
+
+    pub fn rhs(&self) -> ByteNgram {
+        ByteNgram(match self.dim() {
+            2 => (self.0 & 0x0000_0000_0000_FF00) + (self.dim() - 1) as u64,
+            3 => (self.0 & 0x0000_0000_00FF_FF00) + (self.dim() - 1) as u64,
+            4 => (self.0 & 0x0000_0000_FFFF_FF00) + (self.dim() - 1) as u64,
+            5 => (self.0 & 0x0000_00FF_FFFF_FF00) + (self.dim() - 1) as u64,
+            6 => (self.0 & 0x0000_FFFF_FFFF_FF00) + (self.dim() - 1) as u64,
+            7 => (self.0 & 0x00FF_FFFF_FFFF_FF00) + (self.dim() - 1) as u64,
+            _ => 0,
+        })
+    }
+}
+
+impl fmt::Debug for ByteNgram {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "{:?}", self.bytes())
+    }
+}
+
+impl ops::Deref for ByteNgram {
+    type Target = u64;
+
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+
+impl convert::From<u64> for ByteNgram {
+    fn from(value: u64) -> Self {
+        ByteNgram(value)
+    }
+}
+
+pub struct ByteNgramReader<R> {
+    inner: Bytes<R>,
+    token: u64,
+    index: u8,
+    count: u8,
+}
+
+impl<R: Read> ByteNgramReader<R> {
+    pub fn new(inner: R) -> Self {
+        ByteNgramReader {
+            inner: inner.bytes(),
+            token: 0,
+            index: 0,
+            count: 0,
+        }
+    }
+}
+
+impl<R: Read> Iterator for ByteNgramReader<R> {
+    type Item = ByteNgram;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.index == 0 {
+            if let Some(Ok(byte)) = self.inner.next() {
+                self.token += u64::from(byte);
+                self.token <<= 8;
+
+                if self.count < 7 {
+                    self.count += 1;
+                }
+            } else {
+                return None;
+            }
+        }
+
+        self.index += 1;
+
+        let token = ByteNgram(match self.index {
+            1 => (self.token & 0x0000_0000_0000_FF00) + 1,
+            2 => (self.token & 0x0000_0000_00FF_FF00) + 2,
+            3 => (self.token & 0x0000_0000_FFFF_FF00) + 3,
+            4 => (self.token & 0x0000_00FF_FFFF_FF00) + 4,
+            5 => (self.token & 0x0000_FFFF_FFFF_FF00) + 5,
+            6 => (self.token & 0x00FF_FFFF_FFFF_FF00) + 6,
+            7 => (self.token & 0xFFFF_FFFF_FFFF_FF00) + 7,
+            _ => unreachable!(),
+        });
+
+        self.index %= self.count;
+
+        Some(token)
+    }
+}
+
+pub fn from_slice<T>(data: T) -> Vec<ByteNgram>
+where
+    T: AsRef<[u8]>,
+{
+    let buffer = data.as_ref();
+
+    let mut tokens = Vec::with_capacity(28 + cmp::max((buffer.len() as i64 - 7) * 7, 1) as usize);
+
+    for i in 0..buffer.len() {
+        {
+            tokens.push(ByteNgram::from_slice(&buffer[i..i + 1]));
+        }
+
+        if i >= 1 {
+            tokens.push(ByteNgram::from_slice(&buffer[i - 1..i + 1]));
+        }
+
+        if i >= 2 {
+            tokens.push(ByteNgram::from_slice(&buffer[i - 2..i + 1]));
+        }
+
+        if i >= 3 {
+            tokens.push(ByteNgram::from_slice(&buffer[i - 3..i + 1]));
+        }
+
+        if i >= 4 {
+            tokens.push(ByteNgram::from_slice(&buffer[i - 4..i + 1]));
+        }
+
+        if i >= 5 {
+            tokens.push(ByteNgram::from_slice(&buffer[i - 5..i + 1]));
+        }
+
+        if i >= 6 {
+            tokens.push(ByteNgram::from_slice(&buffer[i - 6..i + 1]));
+        }
+    }
+
+    tokens
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_lhs() {
+        assert_eq!(ByteNgram::from_slice("ab").lhs(), ByteNgram::from_slice("a"));
+        assert_eq!(ByteNgram::from_slice("abc").lhs(), ByteNgram::from_slice("ab"));
+        assert_eq!(ByteNgram::from_slice("abcd").lhs(), ByteNgram::from_slice("abc"));
+        assert_eq!(ByteNgram::from_slice("abcde").lhs(), ByteNgram::from_slice("abcd"));
+        assert_eq!(
+            ByteNgram::from_slice("abcdef").lhs(),
+            ByteNgram::from_slice("abcde")
+        );
+        assert_eq!(
+            ByteNgram::from_slice("abcdefg").lhs(),
+            ByteNgram::from_slice("abcdef")
+        );
+    }
+
+    #[test]
+    fn test_rhs() {
+        assert_eq!(ByteNgram::from_slice("ab").rhs(), ByteNgram::from_slice("b"));
+        assert_eq!(ByteNgram::from_slice("abc").rhs(), ByteNgram::from_slice("bc"));
+        assert_eq!(ByteNgram::from_slice("abcd").rhs(), ByteNgram::from_slice("bcd"));
+        assert_eq!(ByteNgram::from_slice("abcde").rhs(), ByteNgram::from_slice("bcde"));
+        assert_eq!(
+            ByteNgram::from_slice("abcdef").rhs(),
+            ByteNgram::from_slice("bcdef")
+        );
+        assert_eq!(
+            ByteNgram::from_slice("abcdefg").rhs(),
+            ByteNgram::from_slice("bcdefg")
+        );
+    }
+
+    #[test]
+    fn test_reader() {
+        let data = b"abcdef";
+
+        let a = from_slice(&data);
+        let b = ByteNgramReader::new(&data[..]).collect::<Vec<_>>();
+
+        assert_eq!(a, b);
+    }
+}