Added WIP crate for wiktionary.
This commit is contained in:
10
dict-wiktionary/Cargo.toml
Normal file
10
dict-wiktionary/Cargo.toml
Normal file
@@ -0,0 +1,10 @@
|
||||
[package]
name = "dict-wiktionary"
version = "0.1.0"
authors = ["Anders Olsson <anders.e.olsson@gmail.com>"]
edition = "2018"

[dependencies]
bzip2 = "0.3"
log = "0.4"
quick-xml = "0.16"
|
||||
256
dict-wiktionary/src/lib.rs
Normal file
256
dict-wiktionary/src/lib.rs
Normal file
@@ -0,0 +1,256 @@
|
||||
use std::io::BufRead;
|
||||
|
||||
use log::*;
|
||||
use quick_xml::events::Event;
|
||||
use quick_xml::Reader;
|
||||
|
||||
/// Position of the parser inside the MediaWiki XML dump.
///
/// Each variant names the element whose direct content is currently being
/// read; the value is kept between calls to `Iterator::next` so parsing can
/// resume where the previous page was returned.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum State {
    /// Outside the `<mediawiki>` root element.
    None,
    /// Directly inside `<mediawiki>`, i.e. between pages.
    MediaWiki,
    /// Directly inside a `<page>` element.
    Page,
    /// Inside a page's `<title>` element.
    Title,
    /// Inside a page's `<ns>` (namespace) element.
    Namespace,
    /// Directly inside a page's `<revision>` element.
    Revision,
    /// Inside a revision's `<text>` element.
    Text,
}
|
||||
|
||||
pub struct PageReader<B: BufRead> {
|
||||
reader: Reader<B>,
|
||||
state: State,
|
||||
}
|
||||
|
||||
impl<B: BufRead> PageReader<B> {
|
||||
pub fn from_reader(reader: B) -> PageReader<B> {
|
||||
let mut reader = Reader::from_reader(reader);
|
||||
|
||||
reader
|
||||
.expand_empty_elements(true)
|
||||
.trim_text(false)
|
||||
.check_end_names(true);
|
||||
|
||||
PageReader {
|
||||
reader,
|
||||
state: State::None,
|
||||
}
|
||||
}
|
||||
|
||||
fn ignore(&mut self, tag: &[u8]) {
|
||||
let mut buffer = Vec::with_capacity(256);
|
||||
let mut depth = 0;
|
||||
|
||||
loop {
|
||||
match self.reader.read_event(&mut buffer) {
|
||||
Ok(Event::Start(_)) => {
|
||||
depth += 1;
|
||||
}
|
||||
Ok(Event::End(ref e)) if e.name() == tag && depth == 0 => {
|
||||
break;
|
||||
}
|
||||
Ok(Event::End(_)) if depth == 0 => {
|
||||
// This is bad, this shouldn't be possible. Might be a bad xml file.
|
||||
// Should we panic? Should we return a error? Just break for now...
|
||||
break;
|
||||
}
|
||||
Ok(Event::End(_)) => {
|
||||
depth -= 1;
|
||||
}
|
||||
Err(e) => {
|
||||
// self.state = State::MediaWiki;
|
||||
|
||||
debug!(
|
||||
"ignore: error at position {}: {:?}",
|
||||
self.reader.buffer_position(),
|
||||
e
|
||||
);
|
||||
|
||||
break;
|
||||
}
|
||||
_ => (),
|
||||
}
|
||||
|
||||
buffer.clear();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<B: BufRead> Iterator for PageReader<B> {
|
||||
type Item = (String, String);
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
let mut title = None;
|
||||
let mut buffer = Vec::with_capacity(265);
|
||||
|
||||
loop {
|
||||
let event = self.reader.read_event(&mut buffer);
|
||||
|
||||
match event {
|
||||
Ok(Event::Start(_)) => trace!("state={:#?}, event=Ok(Event::Start(_))", self.state),
|
||||
Ok(Event::End(_)) => trace!("state={:#?}, event=Ok(Event::End(_))", self.state),
|
||||
Ok(Event::Text(_)) => trace!("state={:#?}, event=Ok(Event::Text(_))", self.state),
|
||||
Ok(Event::Empty(_)) => trace!("state={:#?}, event=Ok(Event::Empty(_))", self.state),
|
||||
_ => trace!("state={:#?}, event={:#?}", self.state, event),
|
||||
}
|
||||
|
||||
match (self.state, event) {
|
||||
(State::None, Ok(Event::Start(ref e))) => match e.name() {
|
||||
b"mediawiki" => self.state = State::MediaWiki,
|
||||
tag => self.ignore(tag),
|
||||
},
|
||||
|
||||
(State::MediaWiki, Ok(Event::Start(ref e))) => match e.name() {
|
||||
b"page" => self.state = State::Page,
|
||||
tag => self.ignore(tag),
|
||||
},
|
||||
(State::MediaWiki, Ok(Event::End(ref e))) if e.name() == b"mediawiki" => {
|
||||
self.state = State::None;
|
||||
}
|
||||
(State::MediaWiki, Err(e)) => {
|
||||
self.state = State::MediaWiki;
|
||||
|
||||
debug!(
|
||||
"{:?}: error at position {}: {:?}",
|
||||
self.state,
|
||||
self.reader.buffer_position(),
|
||||
e
|
||||
);
|
||||
}
|
||||
|
||||
(State::Page, Ok(Event::Start(ref e))) => match e.name() {
|
||||
b"title" => self.state = State::Title,
|
||||
b"ns" => self.state = State::Namespace,
|
||||
b"revision" => self.state = State::Revision,
|
||||
tag => self.ignore(tag),
|
||||
},
|
||||
(State::Page, Ok(Event::End(ref e))) if e.name() == b"page" => {
|
||||
self.state = State::MediaWiki;
|
||||
}
|
||||
(State::Page, Err(e)) => {
|
||||
self.state = State::MediaWiki;
|
||||
|
||||
debug!(
|
||||
"{:?}: error at position {}: {:?}",
|
||||
self.state,
|
||||
self.reader.buffer_position(),
|
||||
e
|
||||
);
|
||||
}
|
||||
|
||||
(State::Title, Ok(Event::Text(e))) => {
|
||||
if let Ok(text) = e.unescape_and_decode(&self.reader) {
|
||||
title = Some(text);
|
||||
}
|
||||
}
|
||||
(State::Title, Ok(Event::End(ref e))) if e.name() == b"title" => {
|
||||
self.state = State::Page;
|
||||
}
|
||||
(State::Title, Err(e)) => {
|
||||
self.state = State::Page;
|
||||
|
||||
debug!(
|
||||
"{:?}: error at position {}: {:?}",
|
||||
self.state,
|
||||
self.reader.buffer_position(),
|
||||
e
|
||||
);
|
||||
}
|
||||
|
||||
(State::Namespace, Ok(Event::Text(e))) => {
|
||||
if let Ok(text) = e.unescape_and_decode(&self.reader) {
|
||||
if text != "0" {
|
||||
self.state = State::MediaWiki;
|
||||
}
|
||||
}
|
||||
}
|
||||
(State::Namespace, Ok(Event::End(ref e))) if e.name() == b"ns" => {
|
||||
self.state = State::Page;
|
||||
}
|
||||
(State::Namespace, Err(e)) => {
|
||||
self.state = State::Page;
|
||||
|
||||
debug!(
|
||||
"{:?}: error at position {}: {:?}",
|
||||
self.state,
|
||||
self.reader.buffer_position(),
|
||||
e
|
||||
);
|
||||
}
|
||||
|
||||
(State::Revision, Ok(Event::Start(ref e))) => match e.name() {
|
||||
b"text" => self.state = State::Text,
|
||||
tag => self.ignore(tag),
|
||||
},
|
||||
(State::Revision, Ok(Event::End(ref e))) if e.name() == b"revision" => {
|
||||
self.state = State::Page;
|
||||
}
|
||||
(State::Revision, Err(e)) => {
|
||||
self.state = State::Page;
|
||||
|
||||
debug!(
|
||||
"{:?}: error at position {}: {:?}",
|
||||
self.state,
|
||||
self.reader.buffer_position(),
|
||||
e
|
||||
);
|
||||
}
|
||||
|
||||
(State::Text, Ok(Event::Text(e))) => {
|
||||
if let Ok(text) = e.unescape_and_decode(&self.reader) {
|
||||
return Some((title.unwrap_or_else(|| "".to_string()), text));
|
||||
}
|
||||
}
|
||||
(State::Text, Ok(Event::End(ref e))) if e.name() == b"text" => {
|
||||
self.state = State::Revision;
|
||||
}
|
||||
(State::Text, Err(_)) => {
|
||||
self.state = State::Revision;
|
||||
}
|
||||
|
||||
(_, Ok(Event::Eof)) => break,
|
||||
|
||||
(_, Err(e)) => {
|
||||
self.state = State::MediaWiki;
|
||||
|
||||
debug!(
|
||||
"{:?}: error at position {}: {:?}",
|
||||
self.state,
|
||||
self.reader.buffer_position(),
|
||||
e
|
||||
);
|
||||
}
|
||||
|
||||
_ => (),
|
||||
}
|
||||
|
||||
buffer.clear();
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use std::env;
    use std::fs;
    use std::io::BufReader;

    use bzip2::read::BzDecoder;

    use super::*;

    /// Minimal dump with two main-namespace pages and one template page.
    const SAMPLE: &str = r#"<mediawiki>
  <siteinfo><sitename>Wiktionary</sitename></siteinfo>
  <page>
    <title>hej</title>
    <ns>0</ns>
    <id>1</id>
    <revision>
      <id>2</id>
      <text xml:space="preserve">==Svenska==</text>
    </revision>
  </page>
  <page>
    <title>Mall:foo</title>
    <ns>10</ns>
    <id>3</id>
    <revision><id>4</id><text>template</text></revision>
  </page>
  <page>
    <title>katt</title>
    <ns>0</ns>
    <revision><text>cat &amp; dog</text></revision>
  </page>
</mediawiki>
"#;

    /// Pages in namespace 0 are yielded in order with unescaped text; pages
    /// in other namespaces are skipped.
    #[test]
    fn yields_main_namespace_pages() {
        let pages: Vec<_> = PageReader::from_reader(SAMPLE.as_bytes()).collect();

        assert_eq!(
            pages,
            vec![
                ("hej".to_string(), "==Svenska==".to_string()),
                ("katt".to_string(), "cat & dog".to_string()),
            ]
        );
    }

    /// Smoke test against a real bzip2-compressed dump.
    ///
    /// Ignored by default; set `WIKTIONARY_DUMP` to the path of a
    /// `*-pages-articles.xml.bz2` file and run with `cargo test -- --ignored`.
    #[test]
    #[ignore]
    fn reads_pages_from_dump() {
        let path = match env::var_os("WIKTIONARY_DUMP") {
            Some(path) => path,
            None => return,
        };

        let reader = fs::File::open(&path)
            .map(BzDecoder::new)
            .map(BufReader::new)
            .expect("failed to open path");

        let page_reader = PageReader::from_reader(reader);

        assert!(page_reader.count() > 0);
    }
}
|
||||
Reference in New Issue
Block a user