Initial commit.

This commit is contained in:
2018-01-09 11:33:39 +01:00
commit 92fa9401cf
5 changed files with 291 additions and 0 deletions

3
.gitignore vendored Normal file
View File

@@ -0,0 +1,3 @@
/target/
Cargo.lock

14
.gitlab-ci.yml Normal file
View File

@@ -0,0 +1,14 @@
# GitLab CI pipeline: runs the crate's test suite on the latest official Rust image.
image: "rust:latest"
variables:
# Point Cargo's home directory inside the project checkout so that it can be
# listed under `cache: paths:` below.
CARGO_HOME: $CI_PROJECT_DIR/cargo
test:cargo:
script:
# Print the toolchain versions first so they show up in the job log.
- rustc --version && cargo --version
# Run the tests on a release build, single job, and report the wall-clock time.
- time cargo test --verbose --jobs 1 --release
cache:
paths:
# Build artifacts and the Cargo home configured above.
- target/
- cargo/

19
Cargo.toml Normal file
View File

@@ -0,0 +1,19 @@
# Crate manifest for `byte-ngram`.
[package]
name = "byte-ngram"
version = "0.1.0"
authors = ["logaritmisk <anders.e.olsson@gmail.com>"]
[dependencies]
# Provide the `Serialize` / `Deserialize` derives used on `ByteNgram`.
serde = "1.0"
serde_derive = "1.0"
[dev-dependencies]
# Benchmark framework used by benches/bench.rs; taken from git with no pinned
# revision, so the resolved version can change between builds.
criterion = { git = "https://github.com/japaric/criterion.rs.git" }
rand = "0.4"
[[bench]]
name = "bench"
# The benchmark defines its own entry point via `criterion_main!`, so the
# default libtest bench harness is switched off.
harness = false
[profile.release]
# Link-time optimization for release builds.
lto = true

22
benches/bench.rs Normal file
View File

@@ -0,0 +1,22 @@
#[macro_use]
extern crate criterion;
extern crate byte_ngram;
use criterion::{Criterion, Fun};
use byte_ngram::{ByteNgramReader, from_slice};
/// Compares the two n-gram extraction entry points on the same short sentence.
///
/// Both candidates are registered under the "Read" group and are driven to
/// completion on every iteration; the produced n-grams are discarded.
fn criterion_benchmark(c: &mut Criterion) {
    let input = String::from("Blackmail is such an ugly word. I prefer extortion. The 'x' makes it sound cool.");

    let candidates = vec![
        // Eager variant: builds the whole `Vec` of n-grams, then walks it.
        Fun::new("from_slice", |bencher, text: &String| {
            bencher.iter(|| for _ in from_slice(text.as_bytes()) {})
        }),
        // Streaming variant: pulls n-grams one by one from a reader.
        Fun::new("ByteNgramReader", |bencher, text: &String| {
            bencher.iter(|| for _ in ByteNgramReader::new(text.as_bytes()) {})
        }),
    ];

    c.bench_functions("Read", candidates, &input);
}
criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);

233
src/lib.rs Normal file
View File

@@ -0,0 +1,233 @@
#[macro_use]
extern crate serde_derive;
use std::io::{Bytes, Read};
use std::fmt;
use std::cmp;
use std::ops;
use std::convert;
/// A byte n-gram packed into a single `u64`.
///
/// The least-significant byte holds the number of bytes in the n-gram (see
/// `dim`); the n-gram's bytes are stored in the bytes above it, with the last
/// byte of the n-gram in bits 8..16 and earlier bytes in successively higher
/// positions. One byte is taken by the length, which leaves room for at most
/// seven payload bytes.
///
/// Equality, ordering and hashing are derived, so they operate on the packed
/// integer value.
#[derive(Hash, PartialEq, Eq, Clone, Copy, PartialOrd, Ord, Serialize, Deserialize)]
pub struct ByteNgram(u64);
impl ByteNgram {
    /// Number of payload bytes that fit in a `u64` next to the length byte.
    const MAX_DIM: usize = 7;

    /// Packs `data` into a single n-gram.
    ///
    /// The bytes are stored most-significant first above the low byte, and the
    /// low byte receives `data.len()`.
    ///
    /// # Panics
    ///
    /// In builds with debug assertions enabled, panics if `data` is longer than
    /// seven bytes. Without debug assertions the length is not checked and the
    /// leading bytes of an over-long input are shifted out of the value.
    pub fn from_slice<T>(data: T) -> ByteNgram
    where
        T: AsRef<[u8]>,
    {
        let bytes = data.as_ref();
        debug_assert!(bytes.len() <= Self::MAX_DIM);
        // After every step the low byte is zero again, so the addition never carries.
        let payload = bytes
            .iter()
            .fold(0u64, |acc, &byte| (acc + u64::from(byte)) << 8);
        ByteNgram(payload + bytes.len() as u64)
    }

    /// Returns the value of the length byte, i.e. the number of bytes in the n-gram.
    ///
    /// For values that did not come from `from_slice` (for example through
    /// `From<u64>`), this is simply the raw low byte and may exceed seven.
    pub fn dim(&self) -> usize {
        (self.0 & 0xFF) as usize
    }

    /// Unpacks the n-gram's bytes in their original order.
    ///
    /// Only seven payload bytes exist, so a length byte larger than seven is
    /// treated as seven. Without the clamp the shift amount would reach 64 bits
    /// or more, which panics in debug builds and is meaningless otherwise.
    pub fn bytes(&self) -> Vec<u8> {
        let width = self.dim().min(Self::MAX_DIM);
        (1..=width)
            .rev()
            .map(|n| ((self.0 >> (8 * n)) & 0xFF) as u8)
            .collect()
    }

    /// Returns the n-gram without its last byte.
    ///
    /// Yields the empty n-gram (`0`) when the length is not in `2..=7`.
    pub fn lhs(&self) -> ByteNgram {
        // Shifting down by one byte drops the last payload byte and moves the
        // remaining ones into the positions of a shorter n-gram.
        self.shortened(self.0 >> 8)
    }

    /// Returns the n-gram without its first byte.
    ///
    /// Yields the empty n-gram (`0`) when the length is not in `2..=7`.
    pub fn rhs(&self) -> ByteNgram {
        // The trailing bytes are already in place; masking removes the first one.
        self.shortened(self.0)
    }

    /// Keeps the lowest `dim - 1` payload bytes of `aligned` and tags the
    /// result with length `dim - 1`.
    fn shortened(&self, aligned: u64) -> ByteNgram {
        let dim = self.dim();
        if dim < 2 || dim > Self::MAX_DIM {
            return ByteNgram(0);
        }
        // Bits 8..8*dim: exactly `dim - 1` payload bytes, length byte cleared.
        let keep = ((1u64 << (8 * dim)) - 1) & !0xFF;
        ByteNgram((aligned & keep) + (dim as u64 - 1))
    }
}
/// Debug output is the list of payload bytes rather than the packed integer.
impl fmt::Debug for ByteNgram {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let payload = self.bytes();
        // A fresh `{:?}` is used on purpose: formatter flags given by the
        // caller (such as `#`) are not forwarded to the byte list.
        f.write_fmt(format_args!("{:?}", payload))
    }
}
impl ops::Deref for ByteNgram {
type Target = u64;
fn deref(&self) -> &Self::Target {
&self.0
}
}
impl convert::From<u64> for ByteNgram {
fn from(value: u64) -> Self {
ByteNgram(value)
}
}
/// Iterator that turns a byte stream into n-grams of width one to seven.
///
/// For every byte read it yields the 1-gram ending at that byte, then the
/// 2-gram, and so on up to the number of bytes seen so far (at most seven).
/// The source is consumed one byte at a time through `Read::bytes`.
pub struct ByteNgramReader<R: Read> {
    /// Byte-wise view of the wrapped reader.
    inner: Bytes<R>,
    /// The most recent bytes, kept one byte above the (empty) length slot.
    token: u64,
    /// Width of the n-gram emitted last for the current byte; `0` means the
    /// next call has to fetch a new byte first.
    index: u8,
    /// How many bytes are available in `token`, saturating at seven.
    count: u8,
}

impl<R: Read> ByteNgramReader<R> {
    /// Wraps `inner`; nothing is read until the iterator is advanced.
    pub fn new(inner: R) -> Self {
        let source = inner.bytes();
        ByteNgramReader {
            inner: source,
            token: 0,
            index: 0,
            count: 0,
        }
    }
}

impl<R: Read> Iterator for ByteNgramReader<R> {
    type Item = ByteNgram;

    /// Yields the next n-gram, or `None` once the source is exhausted.
    ///
    /// A read error ends the iteration the same way as end of input does.
    fn next(&mut self) -> Option<ByteNgram> {
        // All widths for the previous byte are done: slide the window forward.
        if self.index == 0 {
            match self.inner.next() {
                Some(Ok(byte)) => {
                    self.token = (self.token + u64::from(byte)) << 8;
                    self.count = (self.count + 1).min(7);
                }
                _ => return None,
            }
        }

        self.index += 1;
        let width = self.index;

        // Select the `width` lowest payload bytes; the length slot stays clear.
        let window = match width {
            1..=7 => (!0u64 >> (8 * (7 - width))) & !0xFF,
            _ => unreachable!(),
        };

        // Wraps back to 0 after the widest n-gram available for this byte.
        self.index = width % self.count;

        Some(ByteNgram((self.token & window) + u64::from(width)))
    }
}
/// Collects every n-gram of width one to seven contained in `data`.
///
/// The n-grams are grouped by the position of their last byte, in input
/// order; within a group they are ordered from the narrowest (one byte) to
/// the widest that fits (at most seven bytes).
pub fn from_slice<T>(data: T) -> Vec<ByteNgram>
where
    T: AsRef<[u8]>,
{
    let bytes = data.as_ref();

    // The first seven positions contribute 1 + 2 + ... + 7 = 28 n-grams, every
    // later position contributes seven more.
    let beyond_warmup = cmp::max((bytes.len() as i64 - 7) * 7, 1) as usize;
    let mut ngrams = Vec::with_capacity(28 + beyond_warmup);

    for end in 1..=bytes.len() {
        let widest = cmp::min(end, 7);
        for width in 1..=widest {
            ngrams.push(ByteNgram::from_slice(&bytes[end - width..end]));
        }
    }

    ngrams
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Every prefix of this word with two to seven bytes is used as a sample.
    const WORD: &str = "abcdefg";

    /// `lhs` of each sample equals the sample without its last byte.
    #[test]
    fn test_lhs() {
        for len in 2..=WORD.len() {
            assert_eq!(
                ByteNgram::from_slice(&WORD[..len]).lhs(),
                ByteNgram::from_slice(&WORD[..len - 1])
            );
        }
    }

    /// `rhs` of each sample equals the sample without its first byte.
    #[test]
    fn test_rhs() {
        for len in 2..=WORD.len() {
            assert_eq!(
                ByteNgram::from_slice(&WORD[..len]).rhs(),
                ByteNgram::from_slice(&WORD[1..len])
            );
        }
    }

    /// The streaming reader and the slice-based function agree.
    #[test]
    fn test_reader() {
        let data = b"abcdef";
        let eager = from_slice(&data);
        let streamed: Vec<_> = ByteNgramReader::new(&data[..]).collect();
        assert_eq!(eager, streamed);
    }
}