Initial commit.

This commit is contained in:
2019-03-15 13:31:20 +01:00
commit daac79b169
5 changed files with 647061 additions and 0 deletions

3
.gitignore vendored Normal file
View File

@@ -0,0 +1,3 @@
/target
**/*.rs.bk
Cargo.lock

8
Cargo.toml Normal file
View File

@@ -0,0 +1,8 @@
# Package manifest for the `dict-tei` crate.
[package]
name = "dict-tei"
version = "0.1.0"
authors = ["Anders Olsson <anders.e.olsson@gmail.com>"]
# The crate is written against the Rust 2018 edition.
edition = "2018"
# quick-xml supplies the XML reader and event types used by src/lib.rs.
[dependencies]
quick-xml = "0.13"

477088
data/pol-eng.tei Normal file

File diff suppressed because it is too large Load Diff

169819
data/swe-pol.tei Normal file

File diff suppressed because it is too large Load Diff

143
src/lib.rs Normal file
View File

@@ -0,0 +1,143 @@
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::{Path, PathBuf};
use quick_xml::events::Event;
use quick_xml::Reader as XmlReader;
/// A dictionary reader layered on top of a quick-xml [`XmlReader`].
///
/// Instances are created with `Reader::from_path` or `Reader::from_str` and
/// then driven with `parse`.
pub struct Reader<B: BufRead> {
    /// The underlying streaming XML reader that events are pulled from.
    inner: XmlReader<B>,
    /// Optional base path; set by `from_path` and by the `base_path` setter.
    base_path: Option<PathBuf>,
    /// Callback taking a path and an optional second path and returning raw
    /// bytes. NOTE(review): presumably meant to resolve included files relative
    /// to the base path; nothing in this file invokes it yet, so confirm.
    include_handler: Box<dyn Fn(&Path, Option<&Path>) -> Vec<u8>>,
}
impl Reader<BufReader<File>> {
    /// Opens the file at `path` and wraps it in a [`Reader`].
    ///
    /// The supplied path is also stored as the reader's base path, and the
    /// include handler starts out as a closure that ignores its arguments and
    /// returns an empty byte vector.
    ///
    /// # Panics
    ///
    /// Panics with the message `failed to open path` if the underlying XML
    /// reader cannot open the file.
    pub fn from_path<P>(path: P) -> Reader<BufReader<File>>
    where
        P: Into<PathBuf>,
    {
        let location: PathBuf = path.into();
        let xml = XmlReader::from_file(&location).expect("failed to open path");

        Self {
            inner: xml,
            base_path: Some(location),
            include_handler: Box::new(|_, _| Vec::new()),
        }
    }
}
impl<'a> Reader<&'a [u8]> {
    /// Builds a [`Reader`] over XML text that is already in memory.
    ///
    /// No base path is recorded, and the include handler starts out as a
    /// closure that ignores its arguments and returns an empty byte vector.
    pub fn from_str(s: &str) -> Reader<&[u8]> {
        let xml = XmlReader::from_str(s);

        Reader {
            inner: xml,
            base_path: None,
            include_handler: Box::new(|_, _| Vec::new()),
        }
    }
}
impl<B: BufRead> Reader<B> {
    /// Parses the whole document and pretty-prints the collected entries to
    /// standard output.
    ///
    /// Each entry is a headword (the text of an `orth` element) paired with
    /// the texts of the `quote` elements that follow it, up to the next
    /// `orth` element or the end of the input.
    ///
    /// # Panics
    ///
    /// Panics if the XML reader reports an error while reading an event or
    /// while reading the text of an `orth` / `quote` element.
    pub fn parse(&mut self) {
        let words = self.read_entries();
        println!("{:#?}", words);
    }

    /// Streams through the document and returns every `(headword, translations)`
    /// pair in document order.
    fn read_entries(&mut self) -> Vec<(String, Vec<String>)> {
        /// Parser state between two events.
        enum State {
            /// No headword has been seen yet.
            None,
            /// A headword is open and its translations are being accumulated.
            Definition(String, Vec<String>),
        }

        self.inner.trim_text(true);
        let mut buf = Vec::new();
        let mut state = State::None;
        let mut words = Vec::new();

        loop {
            state = match (state, self.inner.read_event(&mut buf)) {
                (State::None, Ok(Event::Start(ref e))) => match e.name() {
                    b"orth" => {
                        let word = self.inner.read_text(e.name(), &mut Vec::new()).unwrap();
                        State::Definition(word, Vec::new())
                    }
                    _ => State::None,
                },
                (State::Definition(word, mut translations), Ok(Event::Start(e))) => {
                    match e.name() {
                        b"orth" => {
                            // A new headword closes the entry collected so far.
                            words.push((word, translations));
                            let word = self.inner.read_text(e.name(), &mut Vec::new()).unwrap();
                            State::Definition(word, Vec::new())
                        }
                        b"quote" => {
                            let translation =
                                self.inner.read_text(e.name(), &mut Vec::new()).unwrap();
                            translations.push(translation);
                            State::Definition(word, translations)
                        }
                        _ => State::Definition(word, translations),
                    }
                }
                (_, Err(e)) => panic!(
                    "Error at position {}: {:?}",
                    self.inner.buffer_position(),
                    e
                ),
                // Flush the entry that is still open at end of input; without
                // this the last headword of the document would be lost.
                (State::Definition(word, translations), Ok(Event::Eof)) => {
                    words.push((word, translations));
                    break
                }
                (State::None, Ok(Event::Eof)) => break,
                // Every other event (end tags, text, comments, ...) leaves the
                // state untouched.
                (state, _) => state,
            };
            buf.clear();
        }

        words
    }

    /// Sets the base path stored on this reader, replacing any previous value.
    pub fn base_path<P>(&mut self, path: P)
    where
        P: Into<PathBuf>,
    {
        self.base_path = Some(path.into());
    }

    /// Replaces the include handler callback with `f`.
    ///
    /// The handler is stored on the reader; `parse` does not currently call it.
    /// NOTE(review): the two arguments are presumably the include path and the
    /// base path; confirm once the handler is actually wired in.
    pub fn include_handler<F>(&mut self, f: F)
    where
        F: Fn(&Path, Option<&Path>) -> Vec<u8> + 'static,
    {
        self.include_handler = Box::new(f);
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: parsing the bundled dictionary from an in-memory string
    /// must not panic.
    ///
    /// Ignored by default because it parses and prints the entire bundled
    /// dictionary; run it explicitly with `cargo test -- --ignored --nocapture`.
    #[test]
    #[ignore]
    fn it_works() {
        let fixture = include_str!("../data/pol-eng.tei");
        let mut reader = Reader::from_str(fixture);
        reader.parse();
    }

    /// Smoke test: a custom include handler can be installed on a file-backed
    /// reader and parsing then completes without panicking.
    ///
    /// Use `cargo test -- --nocapture` to see the printed output instead of
    /// forcing the test to fail.
    #[test]
    fn test_include_handler() {
        let mut reader = Reader::from_path("data/pol-eng.tei");
        reader.include_handler(|path, _| {
            println!("path: {:?}", path);
            vec![]
        });
        reader.parse();
    }
}