Got it working again.
This commit is contained in:
@@ -2,32 +2,69 @@ use std::fs::File;
|
|||||||
use std::io::{BufRead, BufReader};
|
use std::io::{BufRead, BufReader};
|
||||||
use std::path::{Path, PathBuf};
|
use std::path::{Path, PathBuf};
|
||||||
|
|
||||||
use dict::{Entry, Language, Translation};
|
use dict::{Entry, Language, Sense};
|
||||||
use quick_xml::events::Event;
|
use quick_xml::events::Event;
|
||||||
use quick_xml::Reader as XmlReader;
|
use quick_xml::Reader as XmlReader;
|
||||||
|
|
||||||
/// Position of the streaming parser inside the TEI document tree.
///
/// Each variant names the element whose children are currently being read.
/// The variants carry no data; the entry and sense under construction are
/// tracked separately by the parser.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum State {
    /// Outside of any recognised element (before `<TEI>` or after `</TEI>`).
    None,
    /// Directly inside `<TEI>`.
    Tei,
    /// Directly inside `<text>`.
    Text,
    /// Directly inside `<body>`.
    Body,
    /// Directly inside an `<entry>`.
    Entry,
    /// Directly inside a `<form>`.
    Form,
    /// Directly inside a `<sense>`.
    Sense,
    /// Directly inside a `<cit>` whose `type` attribute is `trans`.
    Cit,
}
|
||||||
|
|
||||||
pub struct Reader<B: BufRead> {
|
pub struct Reader<B: BufRead> {
|
||||||
inner: XmlReader<B>,
|
inner: XmlReader<B>,
|
||||||
base_path: Option<PathBuf>,
|
base_path: Option<PathBuf>,
|
||||||
include_handler: Box<dyn Fn(&Path, Option<&Path>) -> Vec<u8>>,
|
include_handler: Box<dyn Fn(&Path, Option<&Path>) -> Vec<u8>>,
|
||||||
buffer: Vec<u8>,
|
|
||||||
state: State,
|
state: State,
|
||||||
|
language: Option<Language>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<B: BufRead> Reader<B> {
|
||||||
|
fn ignore(&mut self, tag: &[u8]) {
|
||||||
|
let mut buffer = Vec::new();
|
||||||
|
let mut depth = 0;
|
||||||
|
|
||||||
|
loop {
|
||||||
|
match self.inner.read_event(&mut buffer) {
|
||||||
|
Ok(Event::Start(_)) => {
|
||||||
|
depth += 1;
|
||||||
|
}
|
||||||
|
Ok(Event::End(ref e)) if e.name() == tag && depth == 0 => {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
Ok(Event::End(_)) if depth == 0 => {
|
||||||
|
// This is bad, this shouldn't be possible. Might be a bad xml file.
|
||||||
|
// Should we panic? Should we return a error? Just break for now...
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
Ok(Event::End(_)) => {
|
||||||
|
depth -= 1;
|
||||||
|
}
|
||||||
|
_ => (),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn base_path<P>(&mut self, path: P)
|
||||||
|
where
|
||||||
|
P: Into<PathBuf>,
|
||||||
|
{
|
||||||
|
self.base_path = Some(path.into());
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn include_handler<F>(&mut self, f: F)
|
||||||
|
where
|
||||||
|
F: Fn(&Path, Option<&Path>) -> Vec<u8> + 'static,
|
||||||
|
{
|
||||||
|
self.include_handler = Box::new(f);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Reader<BufReader<File>> {
|
impl Reader<BufReader<File>> {
|
||||||
@@ -39,14 +76,14 @@ impl Reader<BufReader<File>> {
|
|||||||
|
|
||||||
let mut inner = XmlReader::from_file(&path).expect("failed to open path");
|
let mut inner = XmlReader::from_file(&path).expect("failed to open path");
|
||||||
|
|
||||||
inner.trim_text(true);
|
inner.trim_text(true).check_end_names(true);
|
||||||
|
|
||||||
Reader {
|
Reader {
|
||||||
inner,
|
inner,
|
||||||
base_path: Some(path),
|
base_path: Some(path),
|
||||||
include_handler: Box::new(|_, _| vec![]),
|
include_handler: Box::new(|_, _| vec![]),
|
||||||
buffer: Vec::new(),
|
|
||||||
state: State::None,
|
state: State::None,
|
||||||
|
language: None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -55,14 +92,14 @@ impl<'a> Reader<&'a [u8]> {
|
|||||||
pub fn from_str(s: &str) -> Reader<&[u8]> {
|
pub fn from_str(s: &str) -> Reader<&[u8]> {
|
||||||
let mut inner = XmlReader::from_str(s);
|
let mut inner = XmlReader::from_str(s);
|
||||||
|
|
||||||
inner.trim_text(true);
|
inner.trim_text(true).check_end_names(true);
|
||||||
|
|
||||||
Reader {
|
Reader {
|
||||||
inner,
|
inner,
|
||||||
base_path: None,
|
base_path: None,
|
||||||
include_handler: Box::new(|_, _| vec![]),
|
include_handler: Box::new(|_, _| vec![]),
|
||||||
buffer: Vec::new(),
|
|
||||||
state: State::None,
|
state: State::None,
|
||||||
|
language: None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -71,6 +108,144 @@ impl<B: BufRead> Iterator for Reader<B> {
|
|||||||
type Item = Entry;
|
type Item = Entry;
|
||||||
|
|
||||||
fn next(&mut self) -> Option<Self::Item> {
|
fn next(&mut self) -> Option<Self::Item> {
|
||||||
|
let mut buffer = Vec::new();
|
||||||
|
let mut entry = None;
|
||||||
|
let mut sense = None;
|
||||||
|
|
||||||
|
loop {
|
||||||
|
let event = self.inner.read_event(&mut buffer);
|
||||||
|
|
||||||
|
match (self.state, event) {
|
||||||
|
(State::None, Ok(Event::Start(ref e))) => match e.name() {
|
||||||
|
b"TEI" => self.state = State::Tei,
|
||||||
|
tag => self.ignore(tag),
|
||||||
|
},
|
||||||
|
|
||||||
|
(State::Tei, Ok(Event::Start(ref e))) => match e.name() {
|
||||||
|
b"text" => self.state = State::Text,
|
||||||
|
tag => self.ignore(tag),
|
||||||
|
},
|
||||||
|
(State::Tei, Ok(Event::End(ref e))) if e.name() == b"TEI" => {
|
||||||
|
self.state = State::None;
|
||||||
|
}
|
||||||
|
|
||||||
|
(State::Text, Ok(Event::Start(ref e))) => match e.name() {
|
||||||
|
b"body" => {
|
||||||
|
if let Some(lang) = e
|
||||||
|
.attributes()
|
||||||
|
.filter_map(Result::ok)
|
||||||
|
.find(|a| a.key == b"xml:lang")
|
||||||
|
.and_then(|a| {
|
||||||
|
Language::from_639_1(self.inner.decode(a.value.as_ref()).as_ref())
|
||||||
|
})
|
||||||
|
{
|
||||||
|
self.language = Some(lang);
|
||||||
|
}
|
||||||
|
|
||||||
|
self.state = State::Body;
|
||||||
|
}
|
||||||
|
tag => self.ignore(tag),
|
||||||
|
},
|
||||||
|
(State::Text, Ok(Event::End(ref e))) if e.name() == b"text" => {
|
||||||
|
self.state = State::Tei;
|
||||||
|
}
|
||||||
|
|
||||||
|
(State::Body, Ok(Event::Start(ref e))) => match e.name() {
|
||||||
|
b"entry" => self.state = State::Entry,
|
||||||
|
tag => self.ignore(tag),
|
||||||
|
},
|
||||||
|
(State::Body, Ok(Event::End(ref e))) if e.name() == b"body" => {
|
||||||
|
self.state = State::Text;
|
||||||
|
}
|
||||||
|
|
||||||
|
(State::Entry, Ok(Event::Start(ref e))) => match e.name() {
|
||||||
|
b"form" => self.state = State::Form,
|
||||||
|
b"sense" => self.state = State::Sense,
|
||||||
|
tag => self.ignore(tag),
|
||||||
|
},
|
||||||
|
(State::Entry, Ok(Event::End(ref e))) if e.name() == b"entry" => {
|
||||||
|
self.state = State::Body;
|
||||||
|
|
||||||
|
if entry.is_some() {
|
||||||
|
return entry;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
(State::Form, Ok(Event::Start(ref e))) => match e.name() {
|
||||||
|
b"orth" => {
|
||||||
|
if let Ok(word) = self.inner.read_text(e.name(), &mut Vec::new()) {
|
||||||
|
entry = Some(Entry {
|
||||||
|
lang: self.language,
|
||||||
|
orth: word,
|
||||||
|
sense: Vec::new(),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
self.state = State::Form;
|
||||||
|
}
|
||||||
|
tag => self.ignore(tag),
|
||||||
|
},
|
||||||
|
(State::Form, Ok(Event::End(ref e))) if e.name() == b"form" => {
|
||||||
|
self.state = State::Entry
|
||||||
|
}
|
||||||
|
|
||||||
|
(State::Sense, Ok(Event::Start(ref e))) => match e.name() {
|
||||||
|
b"cit" => {
|
||||||
|
if e.attributes()
|
||||||
|
.filter_map(Result::ok)
|
||||||
|
.any(|a| a.key == b"type" && a.value.as_ref() == b"trans")
|
||||||
|
{
|
||||||
|
let language = e
|
||||||
|
.attributes()
|
||||||
|
.filter_map(Result::ok)
|
||||||
|
.find(|a| a.key == b"xml:lang")
|
||||||
|
.and_then(|a| {
|
||||||
|
Language::from_639_1(
|
||||||
|
self.inner.decode(a.value.as_ref()).as_ref(),
|
||||||
|
)
|
||||||
|
});
|
||||||
|
|
||||||
|
sense = Some(Sense {
|
||||||
|
lang: language,
|
||||||
|
quotes: Vec::new(),
|
||||||
|
});
|
||||||
|
|
||||||
|
self.state = State::Cit;
|
||||||
|
} else {
|
||||||
|
self.ignore(b"cit");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
tag => self.ignore(tag),
|
||||||
|
},
|
||||||
|
(State::Sense, Ok(Event::End(ref e))) if e.name() == b"sense" => {
|
||||||
|
self.state = State::Entry
|
||||||
|
}
|
||||||
|
|
||||||
|
(State::Cit, Ok(Event::Start(ref e))) => match e.name() {
|
||||||
|
b"quote" => {
|
||||||
|
if let Ok(word) = self.inner.read_text(e.name(), &mut Vec::new()) {
|
||||||
|
sense.as_mut().map(|sense| sense.quotes.push(word));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
tag => self.ignore(tag),
|
||||||
|
},
|
||||||
|
(State::Cit, Ok(Event::End(ref e))) if e.name() == b"cit" => {
|
||||||
|
if let Some(sense) = sense.take() {
|
||||||
|
entry.as_mut().map(|entry| entry.sense.push(sense));
|
||||||
|
}
|
||||||
|
|
||||||
|
self.state = State::Sense
|
||||||
|
}
|
||||||
|
|
||||||
|
(_, Ok(Event::Eof)) => break,
|
||||||
|
(_, Err(_)) => break,
|
||||||
|
|
||||||
|
(_, _) => (),
|
||||||
|
}
|
||||||
|
|
||||||
|
buffer.clear();
|
||||||
|
}
|
||||||
|
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -133,20 +308,6 @@ impl<B: BufRead> Reader<B> {
|
|||||||
|
|
||||||
println!("{:#?}", words);
|
println!("{:#?}", words);
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn base_path<P>(&mut self, path: P)
|
|
||||||
where
|
|
||||||
P: Into<PathBuf>,
|
|
||||||
{
|
|
||||||
self.base_path = Some(path.into());
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn include_handler<F>(&mut self, f: F)
|
|
||||||
where
|
|
||||||
F: Fn(&Path, Option<&Path>) -> Vec<u8> + 'static,
|
|
||||||
{
|
|
||||||
self.include_handler = Box::new(f);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
@@ -157,14 +318,19 @@ mod tests {
|
|||||||
fn test_reader() {
|
fn test_reader() {
|
||||||
let fixture = include_str!("../data/pol-eng.tei");
|
let fixture = include_str!("../data/pol-eng.tei");
|
||||||
|
|
||||||
let mut reader = Reader::from_str(&fixture);
|
let reader = Reader::from_str(&fixture);
|
||||||
|
|
||||||
reader.parse();
|
// reader.parse();
|
||||||
|
|
||||||
|
for entry in reader {
|
||||||
|
eprintln!("{:#?}", entry);
|
||||||
|
}
|
||||||
|
|
||||||
assert!(true == false);
|
assert!(true == false);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
#[ignore]
|
||||||
fn test_include_handler() {
|
fn test_include_handler() {
|
||||||
let mut reader = Reader::from_path("data/pol-eng.tei");
|
let mut reader = Reader::from_path("data/pol-eng.tei");
|
||||||
|
|
||||||
|
|||||||
@@ -1,12 +1,14 @@
|
|||||||
pub use isolang::Language;
|
pub use isolang::Language;
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
pub struct Entry {
|
pub struct Entry {
|
||||||
pub language: Option<Language>,
|
pub lang: Option<Language>,
|
||||||
pub orthographic: String,
|
pub orth: String,
|
||||||
pub translation: Vec<Translation>,
|
pub sense: Vec<Sense>,
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct Translation {
|
#[derive(Debug)]
|
||||||
pub language: Option<Language>,
|
pub struct Sense {
|
||||||
pub translations: Vec<String>,
|
pub lang: Option<Language>,
|
||||||
|
pub quotes: Vec<String>,
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user