Added WIP crate for wiktionary.

This commit is contained in:
2019-09-27 09:45:06 +02:00
parent 22fa3e95c3
commit 9f7d575a68
6 changed files with 307 additions and 170 deletions

130
Cargo.lock generated
View File

@@ -1,35 +1,33 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
[[package]] [[package]]
name = "autocfg" name = "autocfg"
version = "0.1.2" version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]] [[package]]
name = "backtrace" name = "bitflags"
version = "0.3.14" version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "bzip2"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [ dependencies = [
"autocfg 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", "bzip2-sys 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)",
"backtrace-sys 0.1.28 (registry+https://github.com/rust-lang/crates.io-index)",
"cfg-if 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.50 (registry+https://github.com/rust-lang/crates.io-index)", "libc 0.2.50 (registry+https://github.com/rust-lang/crates.io-index)",
"rustc-demangle 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)",
"winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
] ]
[[package]] [[package]]
name = "backtrace-sys" name = "bzip2-sys"
version = "0.1.28" version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [ dependencies = [
"cc 1.0.31 (registry+https://github.com/rust-lang/crates.io-index)", "cc 1.0.31 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.50 (registry+https://github.com/rust-lang/crates.io-index)", "libc 0.2.50 (registry+https://github.com/rust-lang/crates.io-index)",
] ]
[[package]]
name = "bitflags"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]] [[package]]
name = "cc" name = "cc"
version = "1.0.31" version = "1.0.31"
@@ -60,35 +58,16 @@ name = "dict-tei"
version = "0.1.0" version = "0.1.0"
dependencies = [ dependencies = [
"dict 0.1.0", "dict 0.1.0",
"quick-xml 0.13.3 (registry+https://github.com/rust-lang/crates.io-index)", "quick-xml 0.16.1 (registry+https://github.com/rust-lang/crates.io-index)",
] ]
[[package]] [[package]]
name = "encoding_rs" name = "dict-wiktionary"
version = "0.8.17" version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [ dependencies = [
"cfg-if 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)", "bzip2 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)",
] "log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)",
"quick-xml 0.16.1 (registry+https://github.com/rust-lang/crates.io-index)",
[[package]]
name = "failure"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"backtrace 0.3.14 (registry+https://github.com/rust-lang/crates.io-index)",
"failure_derive 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "failure_derive"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"proc-macro2 0.4.27 (registry+https://github.com/rust-lang/crates.io-index)",
"quote 0.6.11 (registry+https://github.com/rust-lang/crates.io-index)",
"syn 0.15.29 (registry+https://github.com/rust-lang/crates.io-index)",
"synstructure 0.10.1 (registry+https://github.com/rust-lang/crates.io-index)",
] ]
[[package]] [[package]]
@@ -120,7 +99,7 @@ dependencies = [
[[package]] [[package]]
name = "memchr" name = "memchr"
version = "2.2.0" version = "2.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]] [[package]]
@@ -157,31 +136,12 @@ dependencies = [
"siphasher 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)", "siphasher 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)",
] ]
[[package]]
name = "proc-macro2"
version = "0.4.27"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]] [[package]]
name = "quick-xml" name = "quick-xml"
version = "0.13.3" version = "0.16.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [ dependencies = [
"encoding_rs 0.8.17 (registry+https://github.com/rust-lang/crates.io-index)", "memchr 2.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
"failure 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)",
"log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)",
"memchr 2.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "quote"
version = "0.6.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"proc-macro2 0.4.27 (registry+https://github.com/rust-lang/crates.io-index)",
] ]
[[package]] [[package]]
@@ -288,42 +248,11 @@ dependencies = [
"rand_core 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", "rand_core 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
] ]
[[package]]
name = "rustc-demangle"
version = "0.1.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]] [[package]]
name = "siphasher" name = "siphasher"
version = "0.2.3" version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "syn"
version = "0.15.29"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"proc-macro2 0.4.27 (registry+https://github.com/rust-lang/crates.io-index)",
"quote 0.6.11 (registry+https://github.com/rust-lang/crates.io-index)",
"unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "synstructure"
version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"proc-macro2 0.4.27 (registry+https://github.com/rust-lang/crates.io-index)",
"quote 0.6.11 (registry+https://github.com/rust-lang/crates.io-index)",
"syn 0.15.29 (registry+https://github.com/rust-lang/crates.io-index)",
"unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "unicode-xid"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]] [[package]]
name = "winapi" name = "winapi"
version = "0.3.6" version = "0.3.6"
@@ -345,27 +274,22 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
[metadata] [metadata]
"checksum autocfg 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "a6d640bee2da49f60a4068a7fae53acde8982514ab7bae8b8cea9e88cbcfd799" "checksum autocfg 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "a6d640bee2da49f60a4068a7fae53acde8982514ab7bae8b8cea9e88cbcfd799"
"checksum backtrace 0.3.14 (registry+https://github.com/rust-lang/crates.io-index)" = "cd5a90e2b463010cd0e0ce9a11d4a9d5d58d9f41d4a6ba3dcaf9e68b466e88b4"
"checksum backtrace-sys 0.1.28 (registry+https://github.com/rust-lang/crates.io-index)" = "797c830ac25ccc92a7f8a7b9862bde440715531514594a6154e3d4a54dd769b6"
"checksum bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)" = "228047a76f468627ca71776ecdebd732a3423081fcf5125585bcd7c49886ce12" "checksum bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)" = "228047a76f468627ca71776ecdebd732a3423081fcf5125585bcd7c49886ce12"
"checksum bzip2 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "42b7c3cbf0fa9c1b82308d57191728ca0256cb821220f4e2fd410a72ade26e3b"
"checksum bzip2-sys 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)" = "6584aa36f5ad4c9247f5323b0a42f37802b37a836f0ad87084d7a33961abe25f"
"checksum cc 1.0.31 (registry+https://github.com/rust-lang/crates.io-index)" = "c9ce8bb087aacff865633f0bd5aeaed910fe2fe55b55f4739527f2e023a2e53d" "checksum cc 1.0.31 (registry+https://github.com/rust-lang/crates.io-index)" = "c9ce8bb087aacff865633f0bd5aeaed910fe2fe55b55f4739527f2e023a2e53d"
"checksum cfg-if 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)" = "11d43355396e872eefb45ce6342e4374ed7bc2b3a502d1b28e36d6e23c05d1f4" "checksum cfg-if 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)" = "11d43355396e872eefb45ce6342e4374ed7bc2b3a502d1b28e36d6e23c05d1f4"
"checksum cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f" "checksum cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f"
"checksum encoding_rs 0.8.17 (registry+https://github.com/rust-lang/crates.io-index)" = "4155785c79f2f6701f185eb2e6b4caf0555ec03477cb4c70db67b465311620ed"
"checksum failure 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "795bd83d3abeb9220f257e597aa0080a508b27533824adf336529648f6abf7e2"
"checksum failure_derive 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "ea1063915fd7ef4309e222a5a07cf9c319fb9c7836b1f89b85458672dbb127e1"
"checksum fuchsia-cprng 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba" "checksum fuchsia-cprng 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba"
"checksum isolang 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "265ef164908329e47e753c769b14cbb27434abf0c41984dca201484022f09ce5" "checksum isolang 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "265ef164908329e47e753c769b14cbb27434abf0c41984dca201484022f09ce5"
"checksum libc 0.2.50 (registry+https://github.com/rust-lang/crates.io-index)" = "aab692d7759f5cd8c859e169db98ae5b52c924add2af5fbbca11d12fefb567c1" "checksum libc 0.2.50 (registry+https://github.com/rust-lang/crates.io-index)" = "aab692d7759f5cd8c859e169db98ae5b52c924add2af5fbbca11d12fefb567c1"
"checksum log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c84ec4b527950aa83a329754b01dbe3f58361d1c5efacd1f6d68c494d08a17c6" "checksum log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c84ec4b527950aa83a329754b01dbe3f58361d1c5efacd1f6d68c494d08a17c6"
"checksum memchr 2.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "2efc7bc57c883d4a4d6e3246905283d8dae951bb3bd32f49d6ef297f546e1c39" "checksum memchr 2.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "88579771288728879b57485cc7d6b07d648c9f0141eb955f8ab7f9d45394468e"
"checksum phf 0.7.24 (registry+https://github.com/rust-lang/crates.io-index)" = "b3da44b85f8e8dfaec21adae67f95d93244b2ecf6ad2a692320598dcc8e6dd18" "checksum phf 0.7.24 (registry+https://github.com/rust-lang/crates.io-index)" = "b3da44b85f8e8dfaec21adae67f95d93244b2ecf6ad2a692320598dcc8e6dd18"
"checksum phf_codegen 0.7.24 (registry+https://github.com/rust-lang/crates.io-index)" = "b03e85129e324ad4166b06b2c7491ae27fe3ec353af72e72cd1654c7225d517e" "checksum phf_codegen 0.7.24 (registry+https://github.com/rust-lang/crates.io-index)" = "b03e85129e324ad4166b06b2c7491ae27fe3ec353af72e72cd1654c7225d517e"
"checksum phf_generator 0.7.24 (registry+https://github.com/rust-lang/crates.io-index)" = "09364cc93c159b8b06b1f4dd8a4398984503483891b0c26b867cf431fb132662" "checksum phf_generator 0.7.24 (registry+https://github.com/rust-lang/crates.io-index)" = "09364cc93c159b8b06b1f4dd8a4398984503483891b0c26b867cf431fb132662"
"checksum phf_shared 0.7.24 (registry+https://github.com/rust-lang/crates.io-index)" = "234f71a15de2288bcb7e3b6515828d22af7ec8598ee6d24c3b526fa0a80b67a0" "checksum phf_shared 0.7.24 (registry+https://github.com/rust-lang/crates.io-index)" = "234f71a15de2288bcb7e3b6515828d22af7ec8598ee6d24c3b526fa0a80b67a0"
"checksum proc-macro2 0.4.27 (registry+https://github.com/rust-lang/crates.io-index)" = "4d317f9caece796be1980837fd5cb3dfec5613ebdb04ad0956deea83ce168915" "checksum quick-xml 0.16.1 (registry+https://github.com/rust-lang/crates.io-index)" = "1cd45021132c1cb5540995e93fcc2cf5a874ef84f9639168fb6819caa023d4be"
"checksum quick-xml 0.13.3 (registry+https://github.com/rust-lang/crates.io-index)" = "22fcc48ecef4609b243e8c01ff4695d08ee0fc9d5bdbc54630e1a5fe8bb40953"
"checksum quote 0.6.11 (registry+https://github.com/rust-lang/crates.io-index)" = "cdd8e04bd9c52e0342b406469d494fcb033be4bdbe5c606016defbb1681411e1"
"checksum rand 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)" = "6d71dacdc3c88c1fde3885a3be3fbab9f35724e6ce99467f7d9c5026132184ca" "checksum rand 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)" = "6d71dacdc3c88c1fde3885a3be3fbab9f35724e6ce99467f7d9c5026132184ca"
"checksum rand_chacha 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "556d3a1ca6600bfcbab7c7c91ccb085ac7fbbcd70e008a98742e7847f4f7bcef" "checksum rand_chacha 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "556d3a1ca6600bfcbab7c7c91ccb085ac7fbbcd70e008a98742e7847f4f7bcef"
"checksum rand_core 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "7a6fdeb83b075e8266dcc8762c22776f6877a63111121f5f8c7411e5be7eed4b" "checksum rand_core 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "7a6fdeb83b075e8266dcc8762c22776f6877a63111121f5f8c7411e5be7eed4b"
@@ -377,11 +301,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
"checksum rand_pcg 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "abf9b09b01790cfe0364f52bf32995ea3c39f4d2dd011eac241d2914146d0b44" "checksum rand_pcg 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "abf9b09b01790cfe0364f52bf32995ea3c39f4d2dd011eac241d2914146d0b44"
"checksum rand_xorshift 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "cbf7e9e623549b0e21f6e97cf8ecf247c1a8fd2e8a992ae265314300b2455d5c" "checksum rand_xorshift 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "cbf7e9e623549b0e21f6e97cf8ecf247c1a8fd2e8a992ae265314300b2455d5c"
"checksum rdrand 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "678054eb77286b51581ba43620cc911abf02758c91f93f479767aed0f90458b2" "checksum rdrand 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "678054eb77286b51581ba43620cc911abf02758c91f93f479767aed0f90458b2"
"checksum rustc-demangle 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)" = "adacaae16d02b6ec37fdc7acfcddf365978de76d1983d3ee22afc260e1ca9619"
"checksum siphasher 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "0b8de496cf83d4ed58b6be86c3a275b8602f6ffe98d3024a869e124147a9a3ac" "checksum siphasher 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "0b8de496cf83d4ed58b6be86c3a275b8602f6ffe98d3024a869e124147a9a3ac"
"checksum syn 0.15.29 (registry+https://github.com/rust-lang/crates.io-index)" = "1825685f977249735d510a242a6727b46efe914bb67e38d30c071b1b72b1d5c2"
"checksum synstructure 0.10.1 (registry+https://github.com/rust-lang/crates.io-index)" = "73687139bf99285483c96ac0add482c3776528beac1d97d444f6e91f203a2015"
"checksum unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc"
"checksum winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "92c1eb33641e276cfa214a0522acad57be5c56b10cb348b3c5117db75f3ac4b0" "checksum winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "92c1eb33641e276cfa214a0522acad57be5c56b10cb348b3c5117db75f3ac4b0"
"checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" "checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
"checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" "checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"

View File

@@ -2,4 +2,5 @@
members = [ members = [
"dict", "dict",
"dict-tei", "dict-tei",
"dict-wiktionary",
] ]

View File

@@ -5,5 +5,7 @@ authors = ["Anders Olsson <anders.e.olsson@gmail.com>"]
edition = "2018" edition = "2018"
[dependencies] [dependencies]
quick-xml = "0.13" quick-xml = "0.16"
dict = { path = "../dict" }
[dependencies.dict]
path = "../dict"

View File

@@ -136,7 +136,12 @@ impl<B: BufRead> Iterator for Reader<B> {
.filter_map(Result::ok) .filter_map(Result::ok)
.find(|a| a.key == b"xml:lang") .find(|a| a.key == b"xml:lang")
.and_then(|a| { .and_then(|a| {
Language::from_639_1(self.inner.decode(a.value.as_ref()).as_ref()) Language::from_639_1(
self.inner
.decode(a.value.as_ref())
.expect("expected language")
.as_ref(),
)
}) })
{ {
self.language = Some(lang); self.language = Some(lang);
@@ -201,7 +206,10 @@ impl<B: BufRead> Iterator for Reader<B> {
.find(|a| a.key == b"xml:lang") .find(|a| a.key == b"xml:lang")
.and_then(|a| { .and_then(|a| {
Language::from_639_1( Language::from_639_1(
self.inner.decode(a.value.as_ref()).as_ref(), self.inner
.decode(a.value.as_ref())
.expect("expected language")
.as_ref(),
) )
}); });
@@ -250,66 +258,6 @@ impl<B: BufRead> Iterator for Reader<B> {
} }
} }
// Ad-hoc parsing helper for `Reader` (shown on the removed side of this diff hunk).
impl<B: BufRead> Reader<B> {
/// Reads XML events from `self.inner` until end of input, grouping the text
/// of each `orth` element with the text of the `quote` elements that follow
/// it, and finally prints the collected `(word, translations)` pairs to
/// stdout with `println!`.
///
/// # Panics
///
/// Panics if the underlying reader reports an error, or if `read_text`
/// fails for an `orth` or `quote` element (the results are `unwrap`ped).
pub fn parse(&mut self) {
// Parser state: either no headword seen yet, or a headword together with
// the translations gathered for it so far.
enum State {
None,
Definition(String, Vec<String>),
}
self.inner.trim_text(true);
let mut buf = Vec::new();
let mut state = State::None;
let mut words = Vec::new();
loop {
// The current state is moved into the match and every arm yields the
// state to use for the next event.
state = match (state, self.inner.read_event(&mut buf)) {
// No headword yet: only an `orth` element starts a definition.
(State::None, Ok(Event::Start(ref e))) => match e.name() {
b"orth" => {
let word = self.inner.read_text(e.name(), &mut Vec::new()).unwrap();
State::Definition(word, Vec::new())
}
_ => State::None,
},
(State::Definition(word, mut translations), Ok(Event::Start(e))) => {
match e.name() {
// A new headword: store the finished entry and start the next one.
b"orth" => {
words.push((word, translations));
let word = self.inner.read_text(e.name(), &mut Vec::new()).unwrap();
State::Definition(word, Vec::new())
}
// A translation belonging to the current headword.
b"quote" => {
let translation =
self.inner.read_text(e.name(), &mut Vec::new()).unwrap();
translations.push(translation);
State::Definition(word, translations)
}
_ => State::Definition(word, translations),
}
}
(_, Err(e)) => panic!(
"Error at position {}: {:?}",
self.inner.buffer_position(),
e
),
// NOTE(review): an entry still held in `State::Definition` when Eof
// arrives is never pushed to `words`, so the last headword is not
// printed — confirm whether that is intended.
(_, Ok(Event::Eof)) => break,
// Every other event leaves the state unchanged.
(state, _) => state,
};
buf.clear();
}
println!("{:#?}", words);
}
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
@@ -340,7 +288,7 @@ mod tests {
vec![] vec![]
}); });
reader.parse(); // reader.parse();
assert!(true == false); assert!(true == false);
} }

View File

@@ -0,0 +1,10 @@
[package]
name = "dict-wiktionary"
version = "0.1.0"
authors = ["Anders Olsson <anders.e.olsson@gmail.com>"]
edition = "2018"
[dependencies]
bzip2 = "0.3"
log = "0.4"
quick-xml = "0.16"

256
dict-wiktionary/src/lib.rs Normal file
View File

@@ -0,0 +1,256 @@
use std::io::BufRead;
use log::*;
use quick_xml::events::Event;
use quick_xml::Reader;
/// Position of [`PageReader`] inside the export document.
///
/// Each variant names the element whose content is currently being read; the
/// iterator moves between them as start and end tags are encountered.
#[derive(Debug, Clone, Copy)]
enum State {
    /// Outside the `mediawiki` root element (initial state, and the state
    /// entered again once the root element is closed).
    None,
    /// Directly inside `mediawiki`, between pages.
    MediaWiki,
    /// Directly inside a `page` element.
    Page,
    /// Inside the `title` element of a page.
    Title,
    /// Inside the `ns` element of a page.
    Namespace,
    /// Inside the `revision` element of a page.
    Revision,
    /// Inside the `text` element of a revision.
    Text,
}
/// Streaming reader that walks an XML export and, through its `Iterator`
/// implementation, yields one `(title, text)` pair per page.
pub struct PageReader<B>
where
    B: BufRead,
{
    /// Underlying XML event reader.
    reader: Reader<B>,
    /// Element the parser is currently inside; kept between calls to `next`.
    state: State,
}
impl<B: BufRead> PageReader<B> {
    /// Creates a page reader on top of `reader`.
    ///
    /// The XML reader is configured to report empty elements as a start/end
    /// pair, to leave text untrimmed and to verify that end tags match their
    /// start tags.
    pub fn from_reader(reader: B) -> PageReader<B> {
        let mut reader = Reader::from_reader(reader);
        reader
            .expand_empty_elements(true)
            .trim_text(false)
            .check_end_names(true);

        PageReader {
            reader,
            state: State::None,
        }
    }

    /// Skips the remainder of the element named `tag`, whose start tag has
    /// already been consumed, including any elements nested inside it.
    ///
    /// Skipping stops early, after a debug log entry, when the reader reports
    /// an error or the input ends before the element is closed.
    fn ignore(&mut self, tag: &[u8]) {
        let mut buffer = Vec::with_capacity(256);
        // Number of nested elements that are currently open below `tag`.
        let mut depth = 0usize;

        loop {
            match self.reader.read_event(&mut buffer) {
                Ok(Event::Start(_)) => depth += 1,
                Ok(Event::End(ref e)) if depth == 0 => {
                    // With no nested element open this end tag closes the
                    // skipped element. A different name should not be
                    // possible in well-formed input; stop skipping either way.
                    if e.name() != tag {
                        debug!(
                            "ignore: unexpected end tag at position {}",
                            self.reader.buffer_position()
                        );
                    }
                    break;
                }
                Ok(Event::End(_)) => depth -= 1,
                Ok(Event::Eof) => {
                    // The input ended inside the skipped element. Without this
                    // arm the loop would keep receiving `Eof` and never end.
                    debug!(
                        "ignore: unexpected end of input at position {}",
                        self.reader.buffer_position()
                    );
                    break;
                }
                Err(e) => {
                    debug!(
                        "ignore: error at position {}: {:?}",
                        self.reader.buffer_position(),
                        e
                    );
                    break;
                }
                _ => (),
            }
            buffer.clear();
        }
    }
}
impl<B: BufRead> Iterator for PageReader<B> {
type Item = (String, String);
fn next(&mut self) -> Option<Self::Item> {
let mut title = None;
let mut buffer = Vec::with_capacity(265);
loop {
let event = self.reader.read_event(&mut buffer);
match event {
Ok(Event::Start(_)) => trace!("state={:#?}, event=Ok(Event::Start(_))", self.state),
Ok(Event::End(_)) => trace!("state={:#?}, event=Ok(Event::End(_))", self.state),
Ok(Event::Text(_)) => trace!("state={:#?}, event=Ok(Event::Text(_))", self.state),
Ok(Event::Empty(_)) => trace!("state={:#?}, event=Ok(Event::Empty(_))", self.state),
_ => trace!("state={:#?}, event={:#?}", self.state, event),
}
match (self.state, event) {
(State::None, Ok(Event::Start(ref e))) => match e.name() {
b"mediawiki" => self.state = State::MediaWiki,
tag => self.ignore(tag),
},
(State::MediaWiki, Ok(Event::Start(ref e))) => match e.name() {
b"page" => self.state = State::Page,
tag => self.ignore(tag),
},
(State::MediaWiki, Ok(Event::End(ref e))) if e.name() == b"mediawiki" => {
self.state = State::None;
}
(State::MediaWiki, Err(e)) => {
self.state = State::MediaWiki;
debug!(
"{:?}: error at position {}: {:?}",
self.state,
self.reader.buffer_position(),
e
);
}
(State::Page, Ok(Event::Start(ref e))) => match e.name() {
b"title" => self.state = State::Title,
b"ns" => self.state = State::Namespace,
b"revision" => self.state = State::Revision,
tag => self.ignore(tag),
},
(State::Page, Ok(Event::End(ref e))) if e.name() == b"page" => {
self.state = State::MediaWiki;
}
(State::Page, Err(e)) => {
self.state = State::MediaWiki;
debug!(
"{:?}: error at position {}: {:?}",
self.state,
self.reader.buffer_position(),
e
);
}
(State::Title, Ok(Event::Text(e))) => {
if let Ok(text) = e.unescape_and_decode(&self.reader) {
title = Some(text);
}
}
(State::Title, Ok(Event::End(ref e))) if e.name() == b"title" => {
self.state = State::Page;
}
(State::Title, Err(e)) => {
self.state = State::Page;
debug!(
"{:?}: error at position {}: {:?}",
self.state,
self.reader.buffer_position(),
e
);
}
(State::Namespace, Ok(Event::Text(e))) => {
if let Ok(text) = e.unescape_and_decode(&self.reader) {
if text != "0" {
self.state = State::MediaWiki;
}
}
}
(State::Namespace, Ok(Event::End(ref e))) if e.name() == b"ns" => {
self.state = State::Page;
}
(State::Namespace, Err(e)) => {
self.state = State::Page;
debug!(
"{:?}: error at position {}: {:?}",
self.state,
self.reader.buffer_position(),
e
);
}
(State::Revision, Ok(Event::Start(ref e))) => match e.name() {
b"text" => self.state = State::Text,
tag => self.ignore(tag),
},
(State::Revision, Ok(Event::End(ref e))) if e.name() == b"revision" => {
self.state = State::Page;
}
(State::Revision, Err(e)) => {
self.state = State::Page;
debug!(
"{:?}: error at position {}: {:?}",
self.state,
self.reader.buffer_position(),
e
);
}
(State::Text, Ok(Event::Text(e))) => {
if let Ok(text) = e.unescape_and_decode(&self.reader) {
return Some((title.unwrap_or_else(|| "".to_string()), text));
}
}
(State::Text, Ok(Event::End(ref e))) if e.name() == b"text" => {
self.state = State::Revision;
}
(State::Text, Err(_)) => {
self.state = State::Revision;
}
(_, Ok(Event::Eof)) => break,
(_, Err(e)) => {
self.state = State::MediaWiki;
debug!(
"{:?}: error at position {}: {:?}",
self.state,
self.reader.buffer_position(),
e
);
}
_ => (),
}
buffer.clear();
}
None
}
}
#[cfg(test)]
mod tests {
    use std::fs;
    use std::io::BufReader;

    use bzip2::read::BzDecoder;

    use super::*;

    /// Minimal export with one main-namespace page and one page from another
    /// namespace.
    const DUMP: &str = r#"<mediawiki>
  <siteinfo><sitename>Wiktionary</sitename></siteinfo>
  <page>
    <title>hund</title>
    <ns>0</ns>
    <revision>
      <id>1</id>
      <text xml:space="preserve">==Svenska==</text>
    </revision>
  </page>
  <page>
    <title>Mall:test</title>
    <ns>10</ns>
    <revision>
      <id>2</id>
      <text xml:space="preserve">ignored</text>
    </revision>
  </page>
</mediawiki>"#;

    /// Only the page in namespace 0 is yielded, together with its title.
    #[test]
    fn yields_main_namespace_pages() {
        let pages: Vec<_> = PageReader::from_reader(DUMP.as_bytes()).collect();

        assert_eq!(
            pages,
            vec![("hund".to_string(), "==Svenska==".to_string())]
        );
    }

    /// Smoke test against a full dump on the local machine.
    ///
    /// Ignored by default because the corpus is not part of the repository;
    /// run it with `cargo test -- --ignored --nocapture` to see the titles.
    #[test]
    #[ignore]
    fn it_works() {
        let reader = fs::File::open("/Users/olsson/Laboratory/denc-v2/corpus/wiktionary/svwiktionary-20190920-pages-articles.xml.bz2")
            .map(BzDecoder::new)
            .map(BufReader::new)
            .expect("failed to open path");

        let page_reader = PageReader::from_reader(reader);

        for (title, _) in page_reader {
            println!("{}", title);
        }
    }
}