Added WIP crate for Wiktionary.
This commit is contained in:
130
Cargo.lock
generated
130
Cargo.lock
generated
@@ -1,35 +1,33 @@
|
||||
# This file is automatically @generated by Cargo.
|
||||
# It is not intended for manual editing.
|
||||
[[package]]
|
||||
name = "autocfg"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[[package]]
|
||||
name = "backtrace"
|
||||
version = "0.3.14"
|
||||
name = "bitflags"
|
||||
version = "1.0.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[[package]]
|
||||
name = "bzip2"
|
||||
version = "0.3.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"autocfg 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"backtrace-sys 0.1.28 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"cfg-if 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"bzip2-sys 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"libc 0.2.50 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"rustc-demangle 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "backtrace-sys"
|
||||
version = "0.1.28"
|
||||
name = "bzip2-sys"
|
||||
version = "0.1.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"cc 1.0.31 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"libc 0.2.50 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bitflags"
|
||||
version = "1.0.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[[package]]
|
||||
name = "cc"
|
||||
version = "1.0.31"
|
||||
@@ -60,35 +58,16 @@ name = "dict-tei"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"dict 0.1.0",
|
||||
"quick-xml 0.13.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"quick-xml 0.16.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "encoding_rs"
|
||||
version = "0.8.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
name = "dict-wiktionary"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"cfg-if 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "failure"
|
||||
version = "0.1.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"backtrace 0.3.14 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"failure_derive 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "failure_derive"
|
||||
version = "0.1.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"proc-macro2 0.4.27 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"quote 0.6.11 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"syn 0.15.29 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"synstructure 0.10.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"bzip2 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"quick-xml 0.16.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -120,7 +99,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "memchr"
|
||||
version = "2.2.0"
|
||||
version = "2.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[[package]]
|
||||
@@ -157,31 +136,12 @@ dependencies = [
|
||||
"siphasher 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "0.4.27"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quick-xml"
|
||||
version = "0.13.3"
|
||||
version = "0.16.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"encoding_rs 0.8.17 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"failure 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"memchr 2.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "0.6.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"proc-macro2 0.4.27 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"memchr 2.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -288,42 +248,11 @@ dependencies = [
|
||||
"rand_core 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustc-demangle"
|
||||
version = "0.1.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[[package]]
|
||||
name = "siphasher"
|
||||
version = "0.2.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
version = "0.15.29"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"proc-macro2 0.4.27 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"quote 0.6.11 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "synstructure"
|
||||
version = "0.10.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"proc-macro2 0.4.27 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"quote 0.6.11 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"syn 0.15.29 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unicode-xid"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[[package]]
|
||||
name = "winapi"
|
||||
version = "0.3.6"
|
||||
@@ -345,27 +274,22 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[metadata]
|
||||
"checksum autocfg 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "a6d640bee2da49f60a4068a7fae53acde8982514ab7bae8b8cea9e88cbcfd799"
|
||||
"checksum backtrace 0.3.14 (registry+https://github.com/rust-lang/crates.io-index)" = "cd5a90e2b463010cd0e0ce9a11d4a9d5d58d9f41d4a6ba3dcaf9e68b466e88b4"
|
||||
"checksum backtrace-sys 0.1.28 (registry+https://github.com/rust-lang/crates.io-index)" = "797c830ac25ccc92a7f8a7b9862bde440715531514594a6154e3d4a54dd769b6"
|
||||
"checksum bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)" = "228047a76f468627ca71776ecdebd732a3423081fcf5125585bcd7c49886ce12"
|
||||
"checksum bzip2 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "42b7c3cbf0fa9c1b82308d57191728ca0256cb821220f4e2fd410a72ade26e3b"
|
||||
"checksum bzip2-sys 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)" = "6584aa36f5ad4c9247f5323b0a42f37802b37a836f0ad87084d7a33961abe25f"
|
||||
"checksum cc 1.0.31 (registry+https://github.com/rust-lang/crates.io-index)" = "c9ce8bb087aacff865633f0bd5aeaed910fe2fe55b55f4739527f2e023a2e53d"
|
||||
"checksum cfg-if 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)" = "11d43355396e872eefb45ce6342e4374ed7bc2b3a502d1b28e36d6e23c05d1f4"
|
||||
"checksum cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f"
|
||||
"checksum encoding_rs 0.8.17 (registry+https://github.com/rust-lang/crates.io-index)" = "4155785c79f2f6701f185eb2e6b4caf0555ec03477cb4c70db67b465311620ed"
|
||||
"checksum failure 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "795bd83d3abeb9220f257e597aa0080a508b27533824adf336529648f6abf7e2"
|
||||
"checksum failure_derive 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "ea1063915fd7ef4309e222a5a07cf9c319fb9c7836b1f89b85458672dbb127e1"
|
||||
"checksum fuchsia-cprng 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba"
|
||||
"checksum isolang 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "265ef164908329e47e753c769b14cbb27434abf0c41984dca201484022f09ce5"
|
||||
"checksum libc 0.2.50 (registry+https://github.com/rust-lang/crates.io-index)" = "aab692d7759f5cd8c859e169db98ae5b52c924add2af5fbbca11d12fefb567c1"
|
||||
"checksum log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c84ec4b527950aa83a329754b01dbe3f58361d1c5efacd1f6d68c494d08a17c6"
|
||||
"checksum memchr 2.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "2efc7bc57c883d4a4d6e3246905283d8dae951bb3bd32f49d6ef297f546e1c39"
|
||||
"checksum memchr 2.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "88579771288728879b57485cc7d6b07d648c9f0141eb955f8ab7f9d45394468e"
|
||||
"checksum phf 0.7.24 (registry+https://github.com/rust-lang/crates.io-index)" = "b3da44b85f8e8dfaec21adae67f95d93244b2ecf6ad2a692320598dcc8e6dd18"
|
||||
"checksum phf_codegen 0.7.24 (registry+https://github.com/rust-lang/crates.io-index)" = "b03e85129e324ad4166b06b2c7491ae27fe3ec353af72e72cd1654c7225d517e"
|
||||
"checksum phf_generator 0.7.24 (registry+https://github.com/rust-lang/crates.io-index)" = "09364cc93c159b8b06b1f4dd8a4398984503483891b0c26b867cf431fb132662"
|
||||
"checksum phf_shared 0.7.24 (registry+https://github.com/rust-lang/crates.io-index)" = "234f71a15de2288bcb7e3b6515828d22af7ec8598ee6d24c3b526fa0a80b67a0"
|
||||
"checksum proc-macro2 0.4.27 (registry+https://github.com/rust-lang/crates.io-index)" = "4d317f9caece796be1980837fd5cb3dfec5613ebdb04ad0956deea83ce168915"
|
||||
"checksum quick-xml 0.13.3 (registry+https://github.com/rust-lang/crates.io-index)" = "22fcc48ecef4609b243e8c01ff4695d08ee0fc9d5bdbc54630e1a5fe8bb40953"
|
||||
"checksum quote 0.6.11 (registry+https://github.com/rust-lang/crates.io-index)" = "cdd8e04bd9c52e0342b406469d494fcb033be4bdbe5c606016defbb1681411e1"
|
||||
"checksum quick-xml 0.16.1 (registry+https://github.com/rust-lang/crates.io-index)" = "1cd45021132c1cb5540995e93fcc2cf5a874ef84f9639168fb6819caa023d4be"
|
||||
"checksum rand 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)" = "6d71dacdc3c88c1fde3885a3be3fbab9f35724e6ce99467f7d9c5026132184ca"
|
||||
"checksum rand_chacha 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "556d3a1ca6600bfcbab7c7c91ccb085ac7fbbcd70e008a98742e7847f4f7bcef"
|
||||
"checksum rand_core 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "7a6fdeb83b075e8266dcc8762c22776f6877a63111121f5f8c7411e5be7eed4b"
|
||||
@@ -377,11 +301,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
"checksum rand_pcg 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "abf9b09b01790cfe0364f52bf32995ea3c39f4d2dd011eac241d2914146d0b44"
|
||||
"checksum rand_xorshift 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "cbf7e9e623549b0e21f6e97cf8ecf247c1a8fd2e8a992ae265314300b2455d5c"
|
||||
"checksum rdrand 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "678054eb77286b51581ba43620cc911abf02758c91f93f479767aed0f90458b2"
|
||||
"checksum rustc-demangle 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)" = "adacaae16d02b6ec37fdc7acfcddf365978de76d1983d3ee22afc260e1ca9619"
|
||||
"checksum siphasher 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "0b8de496cf83d4ed58b6be86c3a275b8602f6ffe98d3024a869e124147a9a3ac"
|
||||
"checksum syn 0.15.29 (registry+https://github.com/rust-lang/crates.io-index)" = "1825685f977249735d510a242a6727b46efe914bb67e38d30c071b1b72b1d5c2"
|
||||
"checksum synstructure 0.10.1 (registry+https://github.com/rust-lang/crates.io-index)" = "73687139bf99285483c96ac0add482c3776528beac1d97d444f6e91f203a2015"
|
||||
"checksum unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc"
|
||||
"checksum winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "92c1eb33641e276cfa214a0522acad57be5c56b10cb348b3c5117db75f3ac4b0"
|
||||
"checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
|
||||
"checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
|
||||
|
||||
@@ -2,4 +2,5 @@
|
||||
members = [
|
||||
"dict",
|
||||
"dict-tei",
|
||||
"dict-wiktionary",
|
||||
]
|
||||
|
||||
@@ -5,5 +5,7 @@ authors = ["Anders Olsson <anders.e.olsson@gmail.com>"]
|
||||
edition = "2018"
|
||||
|
||||
[dependencies]
|
||||
quick-xml = "0.13"
|
||||
dict = { path = "../dict" }
|
||||
quick-xml = "0.16"
|
||||
|
||||
[dependencies.dict]
|
||||
path = "../dict"
|
||||
|
||||
@@ -136,7 +136,12 @@ impl<B: BufRead> Iterator for Reader<B> {
|
||||
.filter_map(Result::ok)
|
||||
.find(|a| a.key == b"xml:lang")
|
||||
.and_then(|a| {
|
||||
Language::from_639_1(self.inner.decode(a.value.as_ref()).as_ref())
|
||||
Language::from_639_1(
|
||||
self.inner
|
||||
.decode(a.value.as_ref())
|
||||
.expect("expected language")
|
||||
.as_ref(),
|
||||
)
|
||||
})
|
||||
{
|
||||
self.language = Some(lang);
|
||||
@@ -201,7 +206,10 @@ impl<B: BufRead> Iterator for Reader<B> {
|
||||
.find(|a| a.key == b"xml:lang")
|
||||
.and_then(|a| {
|
||||
Language::from_639_1(
|
||||
self.inner.decode(a.value.as_ref()).as_ref(),
|
||||
self.inner
|
||||
.decode(a.value.as_ref())
|
||||
.expect("expected language")
|
||||
.as_ref(),
|
||||
)
|
||||
});
|
||||
|
||||
@@ -250,66 +258,6 @@ impl<B: BufRead> Iterator for Reader<B> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<B: BufRead> Reader<B> {
|
||||
pub fn parse(&mut self) {
|
||||
enum State {
|
||||
None,
|
||||
Definition(String, Vec<String>),
|
||||
}
|
||||
|
||||
self.inner.trim_text(true);
|
||||
|
||||
let mut buf = Vec::new();
|
||||
|
||||
let mut state = State::None;
|
||||
let mut words = Vec::new();
|
||||
|
||||
loop {
|
||||
state = match (state, self.inner.read_event(&mut buf)) {
|
||||
(State::None, Ok(Event::Start(ref e))) => match e.name() {
|
||||
b"orth" => {
|
||||
let word = self.inner.read_text(e.name(), &mut Vec::new()).unwrap();
|
||||
|
||||
State::Definition(word, Vec::new())
|
||||
}
|
||||
_ => State::None,
|
||||
},
|
||||
(State::Definition(word, mut translations), Ok(Event::Start(e))) => {
|
||||
match e.name() {
|
||||
b"orth" => {
|
||||
words.push((word, translations));
|
||||
|
||||
let word = self.inner.read_text(e.name(), &mut Vec::new()).unwrap();
|
||||
|
||||
State::Definition(word, Vec::new())
|
||||
}
|
||||
b"quote" => {
|
||||
let translation =
|
||||
self.inner.read_text(e.name(), &mut Vec::new()).unwrap();
|
||||
|
||||
translations.push(translation);
|
||||
|
||||
State::Definition(word, translations)
|
||||
}
|
||||
_ => State::Definition(word, translations),
|
||||
}
|
||||
}
|
||||
(_, Err(e)) => panic!(
|
||||
"Error at position {}: {:?}",
|
||||
self.inner.buffer_position(),
|
||||
e
|
||||
),
|
||||
(_, Ok(Event::Eof)) => break,
|
||||
(state, _) => state,
|
||||
};
|
||||
|
||||
buf.clear();
|
||||
}
|
||||
|
||||
println!("{:#?}", words);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
@@ -340,7 +288,7 @@ mod tests {
|
||||
vec![]
|
||||
});
|
||||
|
||||
reader.parse();
|
||||
// reader.parse();
|
||||
|
||||
assert!(true == false);
|
||||
}
|
||||
|
||||
10
dict-wiktionary/Cargo.toml
Normal file
10
dict-wiktionary/Cargo.toml
Normal file
@@ -0,0 +1,10 @@
|
||||
[package]
|
||||
name = "dict-wiktionary"
|
||||
version = "0.1.0"
|
||||
authors = ["Anders Olsson <anders.e.olsson@gmail.com>"]
|
||||
edition = "2018"
|
||||
|
||||
[dependencies]
|
||||
bzip2 = "0.3"
|
||||
log = "0.4"
|
||||
quick-xml = "0.16"
|
||||
256
dict-wiktionary/src/lib.rs
Normal file
256
dict-wiktionary/src/lib.rs
Normal file
@@ -0,0 +1,256 @@
|
||||
use std::io::BufRead;
|
||||
|
||||
use log::*;
|
||||
use quick_xml::events::Event;
|
||||
use quick_xml::Reader;
|
||||
|
||||
/// Position of the page reader inside the MediaWiki XML document.
#[derive(Debug, Clone, Copy)]
enum State {
    /// Outside of the `<mediawiki>` root element.
    None,
    /// Inside `<mediawiki>` but not inside a page that is being read; also the
    /// state used while the rest of a page outside namespace `0` is skipped.
    MediaWiki,
    /// Directly inside a `<page>` element.
    Page,
    /// Inside the `<title>` element of a page.
    Title,
    /// Inside the `<ns>` element of a page.
    Namespace,
    /// Directly inside a `<revision>` element.
    Revision,
    /// Inside the `<text>` element of a revision.
    Text,
}
|
||||
|
||||
pub struct PageReader<B: BufRead> {
|
||||
reader: Reader<B>,
|
||||
state: State,
|
||||
}
|
||||
|
||||
impl<B: BufRead> PageReader<B> {
|
||||
pub fn from_reader(reader: B) -> PageReader<B> {
|
||||
let mut reader = Reader::from_reader(reader);
|
||||
|
||||
reader
|
||||
.expand_empty_elements(true)
|
||||
.trim_text(false)
|
||||
.check_end_names(true);
|
||||
|
||||
PageReader {
|
||||
reader,
|
||||
state: State::None,
|
||||
}
|
||||
}
|
||||
|
||||
fn ignore(&mut self, tag: &[u8]) {
|
||||
let mut buffer = Vec::with_capacity(256);
|
||||
let mut depth = 0;
|
||||
|
||||
loop {
|
||||
match self.reader.read_event(&mut buffer) {
|
||||
Ok(Event::Start(_)) => {
|
||||
depth += 1;
|
||||
}
|
||||
Ok(Event::End(ref e)) if e.name() == tag && depth == 0 => {
|
||||
break;
|
||||
}
|
||||
Ok(Event::End(_)) if depth == 0 => {
|
||||
// This is bad, this shouldn't be possible. Might be a bad xml file.
|
||||
// Should we panic? Should we return a error? Just break for now...
|
||||
break;
|
||||
}
|
||||
Ok(Event::End(_)) => {
|
||||
depth -= 1;
|
||||
}
|
||||
Err(e) => {
|
||||
// self.state = State::MediaWiki;
|
||||
|
||||
debug!(
|
||||
"ignore: error at position {}: {:?}",
|
||||
self.reader.buffer_position(),
|
||||
e
|
||||
);
|
||||
|
||||
break;
|
||||
}
|
||||
_ => (),
|
||||
}
|
||||
|
||||
buffer.clear();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<B: BufRead> Iterator for PageReader<B> {
|
||||
type Item = (String, String);
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
let mut title = None;
|
||||
let mut buffer = Vec::with_capacity(265);
|
||||
|
||||
loop {
|
||||
let event = self.reader.read_event(&mut buffer);
|
||||
|
||||
match event {
|
||||
Ok(Event::Start(_)) => trace!("state={:#?}, event=Ok(Event::Start(_))", self.state),
|
||||
Ok(Event::End(_)) => trace!("state={:#?}, event=Ok(Event::End(_))", self.state),
|
||||
Ok(Event::Text(_)) => trace!("state={:#?}, event=Ok(Event::Text(_))", self.state),
|
||||
Ok(Event::Empty(_)) => trace!("state={:#?}, event=Ok(Event::Empty(_))", self.state),
|
||||
_ => trace!("state={:#?}, event={:#?}", self.state, event),
|
||||
}
|
||||
|
||||
match (self.state, event) {
|
||||
(State::None, Ok(Event::Start(ref e))) => match e.name() {
|
||||
b"mediawiki" => self.state = State::MediaWiki,
|
||||
tag => self.ignore(tag),
|
||||
},
|
||||
|
||||
(State::MediaWiki, Ok(Event::Start(ref e))) => match e.name() {
|
||||
b"page" => self.state = State::Page,
|
||||
tag => self.ignore(tag),
|
||||
},
|
||||
(State::MediaWiki, Ok(Event::End(ref e))) if e.name() == b"mediawiki" => {
|
||||
self.state = State::None;
|
||||
}
|
||||
(State::MediaWiki, Err(e)) => {
|
||||
self.state = State::MediaWiki;
|
||||
|
||||
debug!(
|
||||
"{:?}: error at position {}: {:?}",
|
||||
self.state,
|
||||
self.reader.buffer_position(),
|
||||
e
|
||||
);
|
||||
}
|
||||
|
||||
(State::Page, Ok(Event::Start(ref e))) => match e.name() {
|
||||
b"title" => self.state = State::Title,
|
||||
b"ns" => self.state = State::Namespace,
|
||||
b"revision" => self.state = State::Revision,
|
||||
tag => self.ignore(tag),
|
||||
},
|
||||
(State::Page, Ok(Event::End(ref e))) if e.name() == b"page" => {
|
||||
self.state = State::MediaWiki;
|
||||
}
|
||||
(State::Page, Err(e)) => {
|
||||
self.state = State::MediaWiki;
|
||||
|
||||
debug!(
|
||||
"{:?}: error at position {}: {:?}",
|
||||
self.state,
|
||||
self.reader.buffer_position(),
|
||||
e
|
||||
);
|
||||
}
|
||||
|
||||
(State::Title, Ok(Event::Text(e))) => {
|
||||
if let Ok(text) = e.unescape_and_decode(&self.reader) {
|
||||
title = Some(text);
|
||||
}
|
||||
}
|
||||
(State::Title, Ok(Event::End(ref e))) if e.name() == b"title" => {
|
||||
self.state = State::Page;
|
||||
}
|
||||
(State::Title, Err(e)) => {
|
||||
self.state = State::Page;
|
||||
|
||||
debug!(
|
||||
"{:?}: error at position {}: {:?}",
|
||||
self.state,
|
||||
self.reader.buffer_position(),
|
||||
e
|
||||
);
|
||||
}
|
||||
|
||||
(State::Namespace, Ok(Event::Text(e))) => {
|
||||
if let Ok(text) = e.unescape_and_decode(&self.reader) {
|
||||
if text != "0" {
|
||||
self.state = State::MediaWiki;
|
||||
}
|
||||
}
|
||||
}
|
||||
(State::Namespace, Ok(Event::End(ref e))) if e.name() == b"ns" => {
|
||||
self.state = State::Page;
|
||||
}
|
||||
(State::Namespace, Err(e)) => {
|
||||
self.state = State::Page;
|
||||
|
||||
debug!(
|
||||
"{:?}: error at position {}: {:?}",
|
||||
self.state,
|
||||
self.reader.buffer_position(),
|
||||
e
|
||||
);
|
||||
}
|
||||
|
||||
(State::Revision, Ok(Event::Start(ref e))) => match e.name() {
|
||||
b"text" => self.state = State::Text,
|
||||
tag => self.ignore(tag),
|
||||
},
|
||||
(State::Revision, Ok(Event::End(ref e))) if e.name() == b"revision" => {
|
||||
self.state = State::Page;
|
||||
}
|
||||
(State::Revision, Err(e)) => {
|
||||
self.state = State::Page;
|
||||
|
||||
debug!(
|
||||
"{:?}: error at position {}: {:?}",
|
||||
self.state,
|
||||
self.reader.buffer_position(),
|
||||
e
|
||||
);
|
||||
}
|
||||
|
||||
(State::Text, Ok(Event::Text(e))) => {
|
||||
if let Ok(text) = e.unescape_and_decode(&self.reader) {
|
||||
return Some((title.unwrap_or_else(|| "".to_string()), text));
|
||||
}
|
||||
}
|
||||
(State::Text, Ok(Event::End(ref e))) if e.name() == b"text" => {
|
||||
self.state = State::Revision;
|
||||
}
|
||||
(State::Text, Err(_)) => {
|
||||
self.state = State::Revision;
|
||||
}
|
||||
|
||||
(_, Ok(Event::Eof)) => break,
|
||||
|
||||
(_, Err(e)) => {
|
||||
self.state = State::MediaWiki;
|
||||
|
||||
debug!(
|
||||
"{:?}: error at position {}: {:?}",
|
||||
self.state,
|
||||
self.reader.buffer_position(),
|
||||
e
|
||||
);
|
||||
}
|
||||
|
||||
_ => (),
|
||||
}
|
||||
|
||||
buffer.clear();
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use std::fs;
    use std::io::BufReader;

    use bzip2::read::BzDecoder;

    use super::*;

    /// Minimal export with one page outside namespace 0 and one page inside it.
    const SAMPLE: &str = r#"<mediawiki>
  <siteinfo>
    <sitename>Wiktionary</sitename>
  </siteinfo>
  <page>
    <title>Kategori:Substantiv</title>
    <ns>14</ns>
    <id>1</id>
    <revision>
      <id>10</id>
      <text xml:space="preserve">category body</text>
    </revision>
  </page>
  <page>
    <title>hej</title>
    <ns>0</ns>
    <id>2</id>
    <revision>
      <id>20</id>
      <text xml:space="preserve">body</text>
    </revision>
  </page>
</mediawiki>
"#;

    /// Pages outside namespace 0 are skipped; the remaining page is yielded
    /// with its title and text.
    #[test]
    fn yields_only_main_namespace_pages() {
        let pages: Vec<_> = PageReader::from_reader(SAMPLE.as_bytes()).collect();

        assert_eq!(pages, vec![("hej".to_string(), "body".to_string())]);
    }

    /// Smoke test against a real, bzip2-compressed dump.
    ///
    /// Ignored by default. Run it with
    /// `WIKTIONARY_DUMP=/path/to/dump.xml.bz2 cargo test -- --ignored --nocapture`.
    #[test]
    #[ignore]
    fn it_works() {
        let path = match std::env::var_os("WIKTIONARY_DUMP") {
            Some(path) => path,
            None => {
                eprintln!("WIKTIONARY_DUMP is not set; skipping");
                return;
            }
        };

        let reader = fs::File::open(&path)
            .map(BzDecoder::new)
            .map(BufReader::new)
            .expect("failed to open path");

        let page_reader = PageReader::from_reader(reader);

        let mut pages = 0;

        for (title, _) in page_reader {
            println!("{}", title);
            pages += 1;
        }

        assert!(pages > 0, "expected at least one page in the dump");
    }
}
|
||||
Reference in New Issue
Block a user