diff --git a/dict-tei/src/lib.rs b/dict-tei/src/lib.rs index 2512831..77428ab 100644 --- a/dict-tei/src/lib.rs +++ b/dict-tei/src/lib.rs @@ -2,32 +2,69 @@ use std::fs::File; use std::io::{BufRead, BufReader}; use std::path::{Path, PathBuf}; -use dict::{Entry, Language, Translation}; +use dict::{Entry, Language, Sense}; use quick_xml::events::Event; use quick_xml::Reader as XmlReader; -trait Visitor { - fn visit_start(&mut self, event: &Event); -} - +#[derive(Clone, Copy)] enum State { None, + Tei, Text, - Body(Language), - Entry(Language), - Form(Language), - Orth(Entry), - Sense(Entry), - Cit(Entry, Language), - Quote(Entry, Language), + Body, + Entry, + Form, + Sense, + Cit, } pub struct Reader { inner: XmlReader, base_path: Option, include_handler: Box) -> Vec>, - buffer: Vec, state: State, + language: Option, +} + +impl Reader { + fn ignore(&mut self, tag: &[u8]) { + let mut buffer = Vec::new(); + let mut depth = 0; + + loop { + match self.inner.read_event(&mut buffer) { + Ok(Event::Start(_)) => { + depth += 1; + } + Ok(Event::End(ref e)) if e.name() == tag && depth == 0 => { + break; + } + Ok(Event::End(_)) if depth == 0 => { + // This is bad, this shouldn't be possible. Might be a bad xml file. + // Should we panic? Should we return a error? Just break for now... + break; + } + Ok(Event::End(_)) => { + depth -= 1; + } + _ => (), + } + } + } + + pub fn base_path

(&mut self, path: P) + where + P: Into, + { + self.base_path = Some(path.into()); + } + + pub fn include_handler(&mut self, f: F) + where + F: Fn(&Path, Option<&Path>) -> Vec + 'static, + { + self.include_handler = Box::new(f); + } } impl Reader> { @@ -39,14 +76,14 @@ impl Reader> { let mut inner = XmlReader::from_file(&path).expect("failed to open path"); - inner.trim_text(true); + inner.trim_text(true).check_end_names(true); Reader { inner, base_path: Some(path), include_handler: Box::new(|_, _| vec![]), - buffer: Vec::new(), state: State::None, + language: None, } } } @@ -55,14 +92,14 @@ impl<'a> Reader<&'a [u8]> { pub fn from_str(s: &str) -> Reader<&[u8]> { let mut inner = XmlReader::from_str(s); - inner.trim_text(true); + inner.trim_text(true).check_end_names(true); Reader { inner, base_path: None, include_handler: Box::new(|_, _| vec![]), - buffer: Vec::new(), state: State::None, + language: None, } } } @@ -71,6 +108,144 @@ impl Iterator for Reader { type Item = Entry; fn next(&mut self) -> Option { + let mut buffer = Vec::new(); + let mut entry = None; + let mut sense = None; + + loop { + let event = self.inner.read_event(&mut buffer); + + match (self.state, event) { + (State::None, Ok(Event::Start(ref e))) => match e.name() { + b"TEI" => self.state = State::Tei, + tag => self.ignore(tag), + }, + + (State::Tei, Ok(Event::Start(ref e))) => match e.name() { + b"text" => self.state = State::Text, + tag => self.ignore(tag), + }, + (State::Tei, Ok(Event::End(ref e))) if e.name() == b"TEI" => { + self.state = State::None; + } + + (State::Text, Ok(Event::Start(ref e))) => match e.name() { + b"body" => { + if let Some(lang) = e + .attributes() + .filter_map(Result::ok) + .find(|a| a.key == b"xml:lang") + .and_then(|a| { + Language::from_639_1(self.inner.decode(a.value.as_ref()).as_ref()) + }) + { + self.language = Some(lang); + } + + self.state = State::Body; + } + tag => self.ignore(tag), + }, + (State::Text, Ok(Event::End(ref e))) if e.name() == b"text" => { + self.state = State::Tei; + } + + (State::Body, Ok(Event::Start(ref e))) => match e.name() { + b"entry" => self.state = State::Entry, + tag => self.ignore(tag), + }, + (State::Body, Ok(Event::End(ref e))) if e.name() == b"body" => { + self.state = State::Text; + } + + (State::Entry, Ok(Event::Start(ref e))) => match e.name() { + b"form" => self.state = State::Form, + b"sense" => self.state = State::Sense, + tag => self.ignore(tag), + }, + (State::Entry, Ok(Event::End(ref e))) if e.name() == b"entry" => { + self.state = State::Body; + + if entry.is_some() { + return entry; + } + } + + (State::Form, Ok(Event::Start(ref e))) => match e.name() { + b"orth" => { + if let Ok(word) = self.inner.read_text(e.name(), &mut Vec::new()) { + entry = Some(Entry { + lang: self.language, + orth: word, + sense: Vec::new(), + }); + } + + self.state = State::Form; + } + tag => self.ignore(tag), + }, + (State::Form, Ok(Event::End(ref e))) if e.name() == b"form" => { + self.state = State::Entry + } + + (State::Sense, Ok(Event::Start(ref e))) => match e.name() { + b"cit" => { + if e.attributes() + .filter_map(Result::ok) + .any(|a| a.key == b"type" && a.value.as_ref() == b"trans") + { + let language = e + .attributes() + .filter_map(Result::ok) + .find(|a| a.key == b"xml:lang") + .and_then(|a| { + Language::from_639_1( + self.inner.decode(a.value.as_ref()).as_ref(), + ) + }); + + sense = Some(Sense { + lang: language, + quotes: Vec::new(), + }); + + self.state = State::Cit; + } else { + self.ignore(b"cit"); + } + } + tag => self.ignore(tag), + }, + (State::Sense, Ok(Event::End(ref e))) if e.name() == b"sense" => { + self.state = State::Entry + } + + (State::Cit, Ok(Event::Start(ref e))) => match e.name() { + b"quote" => { + if let Ok(word) = self.inner.read_text(e.name(), &mut Vec::new()) { + sense.as_mut().map(|sense| sense.quotes.push(word)); + } + } + tag => self.ignore(tag), + }, + (State::Cit, Ok(Event::End(ref e))) if e.name() == b"cit" => { + if let Some(sense) = sense.take() { + entry.as_mut().map(|entry| entry.sense.push(sense)); + } + + self.state = State::Sense + } + + (_, Ok(Event::Eof)) => break, + (_, Err(_)) => break, + + (_, _) => (), + } + + buffer.clear(); + } + None } } @@ -133,20 +308,6 @@ impl Reader { println!("{:#?}", words); } - - pub fn base_path

(&mut self, path: P) - where - P: Into, - { - self.base_path = Some(path.into()); - } - - pub fn include_handler(&mut self, f: F) - where - F: Fn(&Path, Option<&Path>) -> Vec + 'static, - { - self.include_handler = Box::new(f); - } } #[cfg(test)] @@ -157,14 +318,19 @@ mod tests { fn test_reader() { let fixture = include_str!("../data/pol-eng.tei"); - let mut reader = Reader::from_str(&fixture); + let reader = Reader::from_str(&fixture); - reader.parse(); + // reader.parse(); + + for entry in reader { + eprintln!("{:#?}", entry); + } assert!(true == false); } #[test] + #[ignore] fn test_include_handler() { let mut reader = Reader::from_path("data/pol-eng.tei"); diff --git a/dict/src/lib.rs b/dict/src/lib.rs index 37ac54e..6fd4c18 100644 --- a/dict/src/lib.rs +++ b/dict/src/lib.rs @@ -1,12 +1,14 @@ pub use isolang::Language; +#[derive(Debug)] pub struct Entry { - pub language: Option, - pub orthographic: String, - pub translation: Vec, + pub lang: Option, + pub orth: String, + pub sense: Vec, } -pub struct Translation { - pub language: Option, - pub translations: Vec, +#[derive(Debug)] +pub struct Sense { + pub lang: Option, + pub quotes: Vec, }