Added WIP crate for wiktionary.

This commit is contained in:
2019-09-27 09:45:06 +02:00
parent 22fa3e95c3
commit 9f7d575a68
6 changed files with 307 additions and 170 deletions

View File

@@ -0,0 +1,10 @@
# Manifest for the work-in-progress `dict-wiktionary` crate, which reads
# pages out of a MediaWiki/Wiktionary XML export (see src/lib.rs).
[package]
name = "dict-wiktionary"
version = "0.1.0"
authors = ["Anders Olsson <anders.e.olsson@gmail.com>"]
edition = "2018"
[dependencies]
# Decompression of `.xml.bz2` dumps.
# NOTE(review): in src/lib.rs only the test module uses this crate; if that
# stays true it could move to [dev-dependencies] — confirm before changing.
bzip2 = "0.3"
# Logging facade providing the `trace!` and `debug!` macros used by the parser.
log = "0.4"
# Event-based XML reader used to walk the export.
quick-xml = "0.16"

256
dict-wiktionary/src/lib.rs Normal file
View File

@@ -0,0 +1,256 @@
use std::io::BufRead;
use log::*;
use quick_xml::events::Event;
use quick_xml::Reader;
/// Position of the parser within the element hierarchy of the XML export.
///
/// The value is stored in `PageReader` so that parsing can resume at the
/// right place on the next call to `Iterator::next`.
#[derive(Clone, Copy, Debug)]
enum State {
/// Outside the root `<mediawiki>` element (initial state, and the state
/// after `</mediawiki>` has been read).
None,
/// Directly inside `<mediawiki>`, i.e. between pages.
MediaWiki,
/// Directly inside a `<page>` element.
Page,
/// Inside the `<title>` element of a page.
Title,
/// Inside the `<ns>` (namespace) element of a page.
Namespace,
/// Directly inside the `<revision>` element of a page.
Revision,
/// Inside the `<text>` element of a revision.
Text,
}
/// Streaming reader over an XML export that yields `(title, text)` pairs
/// through its `Iterator` implementation.
///
/// Create one with `PageReader::from_reader`.
pub struct PageReader<B: BufRead> {
// Underlying quick-xml event reader wrapping the caller's input.
reader: Reader<B>,
// Where in the element hierarchy parsing currently stands; kept between
// calls to `next` so iteration resumes mid-document.
state: State,
}
impl<B: BufRead> PageReader<B> {
    /// Creates a `PageReader` that parses the XML document provided by `reader`.
    ///
    /// The underlying parser is configured to report self-closing elements as
    /// a `Start`/`End` pair, to leave text untrimmed, and to verify that
    /// closing tags match their opening tags.
    pub fn from_reader(reader: B) -> PageReader<B> {
        let mut reader = Reader::from_reader(reader);
        reader
            .expand_empty_elements(true)
            .trim_text(false)
            .check_end_names(true);
        PageReader {
            reader,
            state: State::None,
        }
    }

    /// Skips the rest of the element named `tag`, including everything nested
    /// inside it.
    ///
    /// Must be called right after the `Start` event for `tag` has been
    /// consumed; on return the matching `End` event has been consumed as well.
    /// Skipping also stops (after logging at debug level) when the parser
    /// reports an error or when the input ends before the element is closed.
    fn ignore(&mut self, tag: &[u8]) {
        let mut buffer = Vec::with_capacity(256);
        // Number of nested elements currently open inside `tag`.
        let mut depth: usize = 0;
        loop {
            match self.reader.read_event(&mut buffer) {
                Ok(Event::Start(_)) => depth += 1,
                Ok(Event::End(ref e)) if depth == 0 => {
                    // An `End` at depth zero closes the element being skipped.
                    // A different name here should be impossible for
                    // well-formed input; log it and stop rather than panic.
                    if e.name() != tag {
                        debug!(
                            "ignore: expected </{}> but found </{}> at position {}",
                            String::from_utf8_lossy(tag),
                            String::from_utf8_lossy(e.name()),
                            self.reader.buffer_position()
                        );
                    }
                    break;
                }
                Ok(Event::End(_)) => depth -= 1,
                Ok(Event::Eof) => {
                    // The reader keeps reporting `Eof` once the input is
                    // exhausted, so without this arm a truncated document
                    // would make the loop spin forever.
                    debug!(
                        "ignore: unexpected end of input at position {} while skipping <{}>",
                        self.reader.buffer_position(),
                        String::from_utf8_lossy(tag)
                    );
                    break;
                }
                Err(e) => {
                    debug!(
                        "ignore: error at position {}: {:?}",
                        self.reader.buffer_position(),
                        e
                    );
                    break;
                }
                _ => (),
            }
            buffer.clear();
        }
    }
}
impl<B: BufRead> Iterator for PageReader<B> {
type Item = (String, String);
fn next(&mut self) -> Option<Self::Item> {
let mut title = None;
let mut buffer = Vec::with_capacity(265);
loop {
let event = self.reader.read_event(&mut buffer);
match event {
Ok(Event::Start(_)) => trace!("state={:#?}, event=Ok(Event::Start(_))", self.state),
Ok(Event::End(_)) => trace!("state={:#?}, event=Ok(Event::End(_))", self.state),
Ok(Event::Text(_)) => trace!("state={:#?}, event=Ok(Event::Text(_))", self.state),
Ok(Event::Empty(_)) => trace!("state={:#?}, event=Ok(Event::Empty(_))", self.state),
_ => trace!("state={:#?}, event={:#?}", self.state, event),
}
match (self.state, event) {
(State::None, Ok(Event::Start(ref e))) => match e.name() {
b"mediawiki" => self.state = State::MediaWiki,
tag => self.ignore(tag),
},
(State::MediaWiki, Ok(Event::Start(ref e))) => match e.name() {
b"page" => self.state = State::Page,
tag => self.ignore(tag),
},
(State::MediaWiki, Ok(Event::End(ref e))) if e.name() == b"mediawiki" => {
self.state = State::None;
}
(State::MediaWiki, Err(e)) => {
self.state = State::MediaWiki;
debug!(
"{:?}: error at position {}: {:?}",
self.state,
self.reader.buffer_position(),
e
);
}
(State::Page, Ok(Event::Start(ref e))) => match e.name() {
b"title" => self.state = State::Title,
b"ns" => self.state = State::Namespace,
b"revision" => self.state = State::Revision,
tag => self.ignore(tag),
},
(State::Page, Ok(Event::End(ref e))) if e.name() == b"page" => {
self.state = State::MediaWiki;
}
(State::Page, Err(e)) => {
self.state = State::MediaWiki;
debug!(
"{:?}: error at position {}: {:?}",
self.state,
self.reader.buffer_position(),
e
);
}
(State::Title, Ok(Event::Text(e))) => {
if let Ok(text) = e.unescape_and_decode(&self.reader) {
title = Some(text);
}
}
(State::Title, Ok(Event::End(ref e))) if e.name() == b"title" => {
self.state = State::Page;
}
(State::Title, Err(e)) => {
self.state = State::Page;
debug!(
"{:?}: error at position {}: {:?}",
self.state,
self.reader.buffer_position(),
e
);
}
(State::Namespace, Ok(Event::Text(e))) => {
if let Ok(text) = e.unescape_and_decode(&self.reader) {
if text != "0" {
self.state = State::MediaWiki;
}
}
}
(State::Namespace, Ok(Event::End(ref e))) if e.name() == b"ns" => {
self.state = State::Page;
}
(State::Namespace, Err(e)) => {
self.state = State::Page;
debug!(
"{:?}: error at position {}: {:?}",
self.state,
self.reader.buffer_position(),
e
);
}
(State::Revision, Ok(Event::Start(ref e))) => match e.name() {
b"text" => self.state = State::Text,
tag => self.ignore(tag),
},
(State::Revision, Ok(Event::End(ref e))) if e.name() == b"revision" => {
self.state = State::Page;
}
(State::Revision, Err(e)) => {
self.state = State::Page;
debug!(
"{:?}: error at position {}: {:?}",
self.state,
self.reader.buffer_position(),
e
);
}
(State::Text, Ok(Event::Text(e))) => {
if let Ok(text) = e.unescape_and_decode(&self.reader) {
return Some((title.unwrap_or_else(|| "".to_string()), text));
}
}
(State::Text, Ok(Event::End(ref e))) if e.name() == b"text" => {
self.state = State::Revision;
}
(State::Text, Err(_)) => {
self.state = State::Revision;
}
(_, Ok(Event::Eof)) => break,
(_, Err(e)) => {
self.state = State::MediaWiki;
debug!(
"{:?}: error at position {}: {:?}",
self.state,
self.reader.buffer_position(),
e
);
}
_ => (),
}
buffer.clear();
}
None
}
}
#[cfg(test)]
mod tests {
    use std::env;
    use std::fs;
    use std::io::BufReader;

    use bzip2::read::BzDecoder;

    use super::*;

    /// Minimal export with two pages in namespace 0 and one in namespace 10.
    const SAMPLE: &str = r#"<mediawiki>
  <siteinfo><sitename>Wiktionary</sitename></siteinfo>
  <page>
    <title>hund</title>
    <ns>0</ns>
    <id>1</id>
    <revision>
      <id>10</id>
      <text xml:space="preserve">==Svenska==</text>
    </revision>
  </page>
  <page>
    <title>Mall:test</title>
    <ns>10</ns>
    <id>2</id>
    <revision>
      <id>11</id>
      <text xml:space="preserve">template</text>
    </revision>
  </page>
  <page>
    <title>katt</title>
    <ns>0</ns>
    <id>3</id>
    <revision>
      <id>12</id>
      <text xml:space="preserve">&lt;b&gt;katt&lt;/b&gt;</text>
    </revision>
  </page>
</mediawiki>"#;

    /// Pages outside namespace 0 are skipped and page text is unescaped.
    #[test]
    fn yields_main_namespace_pages() {
        let pages: Vec<(String, String)> = PageReader::from_reader(SAMPLE.as_bytes()).collect();
        assert_eq!(
            pages,
            vec![
                ("hund".to_string(), "==Svenska==".to_string()),
                ("katt".to_string(), "<b>katt</b>".to_string()),
            ]
        );
    }

    /// Smoke test against a real bzip2-compressed dump.
    ///
    /// Ignored by default because it needs a local file; run it with
    /// `WIKTIONARY_DUMP=/path/to/dump.xml.bz2 cargo test -- --ignored`.
    #[test]
    #[ignore]
    fn reads_compressed_dump() {
        let path = match env::var_os("WIKTIONARY_DUMP") {
            Some(path) => path,
            // Nothing to check when no dump has been provided.
            None => return,
        };
        let reader = fs::File::open(&path)
            .map(BzDecoder::new)
            .map(BufReader::new)
            .expect("failed to open path");
        let mut pages = PageReader::from_reader(reader);
        assert!(pages.next().is_some());
    }
}