// Files
// dict-tei/dict-tei/src/lib.rs
//
// 296 lines
// 8.9 KiB
// Rust

use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::{Path, PathBuf};
use dict::{Entry, Language, Sense};
use quick_xml::events::Event;
use quick_xml::Reader as XmlReader;
/// Position of the parser inside the TEI element hierarchy.
///
/// The `Iterator` implementation for `Reader` moves one level deeper on each
/// recognised opening tag and one level back on the matching closing tag.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum State {
    /// Outside of any `<TEI>` element (initial state).
    None,
    /// Directly inside `<TEI>`.
    Tei,
    /// Directly inside `<text>`.
    Text,
    /// Directly inside `<body>`.
    Body,
    /// Directly inside `<entry>`.
    Entry,
    /// Directly inside `<form>`.
    Form,
    /// Directly inside `<sense>`.
    Sense,
    /// Directly inside a `<cit type="trans">` element.
    Cit,
}
/// Streaming reader that yields dictionary entries from a TEI document.
///
/// Entries are produced through the `Iterator` implementation; each call to
/// `next` parses up to the end of the next `<entry>` element.
pub struct Reader<B: BufRead> {
/// Underlying quick-xml event reader.
inner: XmlReader<B>,
/// Base path; `from_path` sets it to the opened file's path, `from_str`
/// leaves it empty, and `base_path` overrides it.
base_path: Option<PathBuf>,
/// Callback stored by `include_handler`.
/// NOTE(review): nothing in the visible code invokes it — confirm where
/// includes are meant to be resolved.
include_handler: Box<dyn Fn(&Path, Option<&Path>) -> Vec<u8>>,
/// Current position in the TEI element hierarchy; persists across `next` calls.
state: State,
/// Language taken from the `xml:lang` attribute of `<body>`, copied into
/// every produced entry.
language: Option<Language>,
}
impl<B: BufRead> Reader<B> {
    /// Skips the remainder of the element whose opening tag `tag` has just
    /// been read, including all nested children.
    ///
    /// Reading stops at the closing tag that balances the already consumed
    /// opening tag. It also stops at end of input or on a parse error, so a
    /// truncated or malformed document cannot keep this loop spinning; the
    /// caller will observe the same condition on its next read.
    fn ignore(&mut self, tag: &[u8]) {
        let mut buffer = Vec::new();
        // Number of nested elements opened since `tag` itself was opened.
        let mut depth = 0usize;
        loop {
            match self.inner.read_event(&mut buffer) {
                Ok(Event::Start(_)) => depth += 1,
                // The closing tag we were waiting for.
                Ok(Event::End(ref e)) if e.name() == tag && depth == 0 => break,
                // A different element closed at our level: the document is
                // not well formed. Stop skipping instead of reading past it.
                Ok(Event::End(_)) if depth == 0 => break,
                Ok(Event::End(_)) => depth -= 1,
                // Without this arm a document that ends early would loop forever.
                Ok(Event::Eof) | Err(_) => break,
                _ => (),
            }
            // Events borrow from the buffer; release it so it does not grow
            // with every skipped event.
            buffer.clear();
        }
    }

    /// Sets the base path stored in the reader, replacing any previous value.
    pub fn base_path<P>(&mut self, path: P)
    where
        P: Into<PathBuf>,
    {
        self.base_path = Some(path.into());
    }

    /// Replaces the stored include handler with `f`.
    ///
    /// The handler receives a path and an optional second path and returns
    /// raw bytes.
    /// NOTE(review): presumably the arguments are the include target and the
    /// base path; nothing in the visible code calls the handler — confirm.
    pub fn include_handler<F>(&mut self, f: F)
    where
        F: Fn(&Path, Option<&Path>) -> Vec<u8> + 'static,
    {
        self.include_handler = Box::new(f);
    }
}
impl Reader<BufReader<File>> {
    /// Creates a reader over the TEI file located at `path`.
    ///
    /// The XML reader is configured to trim text and to check end-tag names.
    /// The given path is also stored as the reader's base path.
    ///
    /// # Panics
    ///
    /// Panics if the file cannot be opened.
    pub fn from_path<P>(path: P) -> Reader<BufReader<File>>
    where
        P: Into<PathBuf>,
    {
        let file_path: PathBuf = path.into();

        let mut xml = XmlReader::from_file(&file_path).expect("failed to open path");
        xml.trim_text(true);
        xml.check_end_names(true);

        Reader {
            inner: xml,
            base_path: Some(file_path),
            // Default handler: every include resolves to empty content.
            include_handler: Box::new(|_, _| Vec::new()),
            state: State::None,
            language: None,
        }
    }
}
impl<'a> Reader<&'a [u8]> {
    /// Creates a reader over a TEI document held in memory.
    ///
    /// The XML reader is configured to trim text and to check end-tag names.
    /// No base path is set.
    pub fn from_str(s: &str) -> Reader<&[u8]> {
        let mut xml = XmlReader::from_str(s);
        xml.trim_text(true);
        xml.check_end_names(true);

        Reader {
            inner: xml,
            base_path: None,
            // Default handler: every include resolves to empty content.
            include_handler: Box::new(|_, _| Vec::new()),
            state: State::None,
            language: None,
        }
    }
}
impl<B: BufRead> Iterator for Reader<B> {
type Item = Entry;
fn next(&mut self) -> Option<Self::Item> {
let mut buffer = Vec::new();
let mut entry = None;
let mut sense = None;
loop {
let event = self.inner.read_event(&mut buffer);
match (self.state, event) {
(State::None, Ok(Event::Start(ref e))) => match e.name() {
b"TEI" => self.state = State::Tei,
tag => self.ignore(tag),
},
(State::Tei, Ok(Event::Start(ref e))) => match e.name() {
b"text" => self.state = State::Text,
tag => self.ignore(tag),
},
(State::Tei, Ok(Event::End(ref e))) if e.name() == b"TEI" => {
self.state = State::None;
}
(State::Text, Ok(Event::Start(ref e))) => match e.name() {
b"body" => {
if let Some(lang) = e
.attributes()
.filter_map(Result::ok)
.find(|a| a.key == b"xml:lang")
.and_then(|a| {
Language::from_639_1(
self.inner
.decode(a.value.as_ref())
.expect("expected language")
.as_ref(),
)
})
{
self.language = Some(lang);
}
self.state = State::Body;
}
tag => self.ignore(tag),
},
(State::Text, Ok(Event::End(ref e))) if e.name() == b"text" => {
self.state = State::Tei;
}
(State::Body, Ok(Event::Start(ref e))) => match e.name() {
b"entry" => self.state = State::Entry,
tag => self.ignore(tag),
},
(State::Body, Ok(Event::End(ref e))) if e.name() == b"body" => {
self.state = State::Text;
}
(State::Entry, Ok(Event::Start(ref e))) => match e.name() {
b"form" => self.state = State::Form,
b"sense" => self.state = State::Sense,
tag => self.ignore(tag),
},
(State::Entry, Ok(Event::End(ref e))) if e.name() == b"entry" => {
self.state = State::Body;
if entry.is_some() {
return entry;
}
}
(State::Form, Ok(Event::Start(ref e))) => match e.name() {
b"orth" => {
if let Ok(word) = self.inner.read_text(e.name(), &mut Vec::new()) {
entry = Some(Entry {
lang: self.language,
orth: word,
sense: Vec::new(),
});
}
self.state = State::Form;
}
tag => self.ignore(tag),
},
(State::Form, Ok(Event::End(ref e))) if e.name() == b"form" => {
self.state = State::Entry
}
(State::Sense, Ok(Event::Start(ref e))) => match e.name() {
b"cit" => {
if e.attributes()
.filter_map(Result::ok)
.any(|a| a.key == b"type" && a.value.as_ref() == b"trans")
{
let language = e
.attributes()
.filter_map(Result::ok)
.find(|a| a.key == b"xml:lang")
.and_then(|a| {
Language::from_639_1(
self.inner
.decode(a.value.as_ref())
.expect("expected language")
.as_ref(),
)
});
sense = Some(Sense {
lang: language,
quotes: Vec::new(),
});
self.state = State::Cit;
} else {
self.ignore(b"cit");
}
}
tag => self.ignore(tag),
},
(State::Sense, Ok(Event::End(ref e))) if e.name() == b"sense" => {
self.state = State::Entry
}
(State::Cit, Ok(Event::Start(ref e))) => match e.name() {
b"quote" => {
if let Ok(word) = self.inner.read_text(e.name(), &mut Vec::new()) {
sense.as_mut().map(|sense| sense.quotes.push(word));
}
}
tag => self.ignore(tag),
},
(State::Cit, Ok(Event::End(ref e))) if e.name() == b"cit" => {
if let Some(sense) = sense.take() {
entry.as_mut().map(|entry| entry.sense.push(sense));
}
self.state = State::Sense
}
(_, Ok(Event::Eof)) => break,
(_, Err(_)) => break,
(_, _) => (),
}
buffer.clear();
}
None
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: the bundled dictionary must be readable to the end without
    /// panicking or hanging.
    #[test]
    fn test_reader() {
        let fixture = include_str!("../data/pol-eng.tei");
        let reader = Reader::from_str(fixture);
        let _ = reader.count();
    }

    /// Entries, their headwords and their translation quotes are extracted;
    /// unrelated elements and non-translation `<cit>` elements are skipped.
    #[test]
    fn test_reader_inline_document() {
        let document = r#"<TEI>
  <teiHeader><fileDesc><title>ignored</title></fileDesc></teiHeader>
  <text>
    <body xml:lang="pl">
      <entry>
        <form><orth>kot</orth><pron>kot</pron></form>
        <sense>
          <cit type="trans" xml:lang="en"><quote>cat</quote></cit>
          <cit type="example"><quote>ignored</quote></cit>
        </sense>
      </entry>
      <entry>
        <form><orth>pies</orth></form>
        <sense>
          <cit type="trans" xml:lang="en"><quote>dog</quote><quote>hound</quote></cit>
        </sense>
      </entry>
    </body>
  </text>
</TEI>"#;

        let entries: Vec<Entry> = Reader::from_str(document).collect();

        assert_eq!(entries.len(), 2);

        assert_eq!(entries[0].orth, "kot");
        assert_eq!(entries[0].sense.len(), 1);
        assert_eq!(entries[0].sense[0].quotes, vec!["cat"]);

        assert_eq!(entries[1].orth, "pies");
        assert_eq!(entries[1].sense.len(), 1);
        assert_eq!(entries[1].sense[0].quotes, vec!["dog", "hound"]);
    }

    /// An empty document yields no entries.
    #[test]
    fn test_reader_empty_document() {
        assert_eq!(Reader::from_str("").count(), 0);
    }

    /// Placeholder kept from the original suite: it only runs when requested
    /// explicitly and fails with an explanation instead of a constant-false
    /// assertion.
    #[test]
    #[ignore]
    fn test_include_handler() {
        let mut reader = Reader::from_path("data/pol-eng.tei");
        reader.include_handler(|path, _| {
            println!("path: {:?}", path);
            vec![]
        });
        unimplemented!("no assertion exists yet for include handling");
    }
}