From 9f7d575a68fab2961a94b34635884a2353378100 Mon Sep 17 00:00:00 2001 From: Anders Olsson Date: Fri, 27 Sep 2019 09:45:06 +0200 Subject: [PATCH] Added WIP crate for wiktionary. --- Cargo.lock | 130 ++++--------------- Cargo.toml | 1 + dict-tei/Cargo.toml | 6 +- dict-tei/src/lib.rs | 74 ++--------- dict-wiktionary/Cargo.toml | 10 ++ dict-wiktionary/src/lib.rs | 256 +++++++++++++++++++++++++++++++++++++ 6 files changed, 307 insertions(+), 170 deletions(-) create mode 100644 dict-wiktionary/Cargo.toml create mode 100644 dict-wiktionary/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index feb0cab..a25df64 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,35 +1,33 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. [[package]] name = "autocfg" version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] -name = "backtrace" -version = "0.3.14" +name = "bitflags" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "bzip2" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "autocfg 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", - "backtrace-sys 0.1.28 (registry+https://github.com/rust-lang/crates.io-index)", - "cfg-if 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)", + "bzip2-sys 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)", "libc 0.2.50 (registry+https://github.com/rust-lang/crates.io-index)", - "rustc-demangle 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)", - "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] -name = "backtrace-sys" -version = "0.1.28" +name = "bzip2-sys" +version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "cc 1.0.31 (registry+https://github.com/rust-lang/crates.io-index)", "libc 0.2.50 (registry+https://github.com/rust-lang/crates.io-index)", ] -[[package]] -name = "bitflags" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" - [[package]] name = "cc" version = "1.0.31" @@ -60,35 +58,16 @@ name = "dict-tei" version = "0.1.0" dependencies = [ "dict 0.1.0", - "quick-xml 0.13.3 (registry+https://github.com/rust-lang/crates.io-index)", + "quick-xml 0.16.1 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] -name = "encoding_rs" -version = "0.8.17" -source = "registry+https://github.com/rust-lang/crates.io-index" +name = "dict-wiktionary" +version = "0.1.0" dependencies = [ - "cfg-if 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "failure" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "backtrace 0.3.14 (registry+https://github.com/rust-lang/crates.io-index)", - "failure_derive 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "failure_derive" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "proc-macro2 0.4.27 (registry+https://github.com/rust-lang/crates.io-index)", - "quote 0.6.11 (registry+https://github.com/rust-lang/crates.io-index)", - "syn 0.15.29 (registry+https://github.com/rust-lang/crates.io-index)", - "synstructure 0.10.1 (registry+https://github.com/rust-lang/crates.io-index)", + "bzip2 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", + "log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", + "quick-xml 0.16.1 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -120,7 +99,7 @@ dependencies = [ [[package]] name = "memchr" -version = "2.2.0" +version = "2.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] @@ -157,31 +136,12 @@ dependencies = [ "siphasher 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)", ] -[[package]] -name = "proc-macro2" -version = "0.4.27" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", -] - [[package]] name = "quick-xml" -version = "0.13.3" +version = "0.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "encoding_rs 0.8.17 (registry+https://github.com/rust-lang/crates.io-index)", - "failure 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", - "log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", - "memchr 2.2.0 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "quote" -version = "0.6.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "proc-macro2 0.4.27 (registry+https://github.com/rust-lang/crates.io-index)", + "memchr 2.2.1 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -288,42 +248,11 @@ dependencies = [ "rand_core 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", ] -[[package]] -name = "rustc-demangle" -version = "0.1.13" -source = "registry+https://github.com/rust-lang/crates.io-index" - [[package]] name = "siphasher" version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -[[package]] -name = "syn" -version = "0.15.29" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "proc-macro2 0.4.27 (registry+https://github.com/rust-lang/crates.io-index)", - "quote 0.6.11 (registry+https://github.com/rust-lang/crates.io-index)", - "unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "synstructure" -version = "0.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "proc-macro2 0.4.27 (registry+https://github.com/rust-lang/crates.io-index)", - "quote 0.6.11 (registry+https://github.com/rust-lang/crates.io-index)", - "syn 0.15.29 (registry+https://github.com/rust-lang/crates.io-index)", - "unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "unicode-xid" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" - [[package]] name = "winapi" version = "0.3.6" @@ -345,27 +274,22 @@ source = "registry+https://github.com/rust-lang/crates.io-index" [metadata] "checksum autocfg 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "a6d640bee2da49f60a4068a7fae53acde8982514ab7bae8b8cea9e88cbcfd799" -"checksum backtrace 0.3.14 (registry+https://github.com/rust-lang/crates.io-index)" = "cd5a90e2b463010cd0e0ce9a11d4a9d5d58d9f41d4a6ba3dcaf9e68b466e88b4" -"checksum backtrace-sys 0.1.28 (registry+https://github.com/rust-lang/crates.io-index)" = "797c830ac25ccc92a7f8a7b9862bde440715531514594a6154e3d4a54dd769b6" "checksum bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)" = "228047a76f468627ca71776ecdebd732a3423081fcf5125585bcd7c49886ce12" +"checksum bzip2 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "42b7c3cbf0fa9c1b82308d57191728ca0256cb821220f4e2fd410a72ade26e3b" +"checksum bzip2-sys 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)" = "6584aa36f5ad4c9247f5323b0a42f37802b37a836f0ad87084d7a33961abe25f" "checksum cc 1.0.31 (registry+https://github.com/rust-lang/crates.io-index)" = "c9ce8bb087aacff865633f0bd5aeaed910fe2fe55b55f4739527f2e023a2e53d" "checksum cfg-if 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)" = "11d43355396e872eefb45ce6342e4374ed7bc2b3a502d1b28e36d6e23c05d1f4" "checksum cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f" -"checksum encoding_rs 0.8.17 (registry+https://github.com/rust-lang/crates.io-index)" = "4155785c79f2f6701f185eb2e6b4caf0555ec03477cb4c70db67b465311620ed" -"checksum failure 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "795bd83d3abeb9220f257e597aa0080a508b27533824adf336529648f6abf7e2" -"checksum failure_derive 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "ea1063915fd7ef4309e222a5a07cf9c319fb9c7836b1f89b85458672dbb127e1" "checksum fuchsia-cprng 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba" "checksum isolang 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "265ef164908329e47e753c769b14cbb27434abf0c41984dca201484022f09ce5" "checksum libc 0.2.50 (registry+https://github.com/rust-lang/crates.io-index)" = "aab692d7759f5cd8c859e169db98ae5b52c924add2af5fbbca11d12fefb567c1" "checksum log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c84ec4b527950aa83a329754b01dbe3f58361d1c5efacd1f6d68c494d08a17c6" -"checksum memchr 2.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "2efc7bc57c883d4a4d6e3246905283d8dae951bb3bd32f49d6ef297f546e1c39" +"checksum memchr 2.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "88579771288728879b57485cc7d6b07d648c9f0141eb955f8ab7f9d45394468e" "checksum phf 0.7.24 (registry+https://github.com/rust-lang/crates.io-index)" = "b3da44b85f8e8dfaec21adae67f95d93244b2ecf6ad2a692320598dcc8e6dd18" "checksum phf_codegen 0.7.24 (registry+https://github.com/rust-lang/crates.io-index)" = "b03e85129e324ad4166b06b2c7491ae27fe3ec353af72e72cd1654c7225d517e" "checksum phf_generator 0.7.24 (registry+https://github.com/rust-lang/crates.io-index)" = "09364cc93c159b8b06b1f4dd8a4398984503483891b0c26b867cf431fb132662" "checksum phf_shared 0.7.24 (registry+https://github.com/rust-lang/crates.io-index)" = "234f71a15de2288bcb7e3b6515828d22af7ec8598ee6d24c3b526fa0a80b67a0" -"checksum proc-macro2 0.4.27 (registry+https://github.com/rust-lang/crates.io-index)" = "4d317f9caece796be1980837fd5cb3dfec5613ebdb04ad0956deea83ce168915" -"checksum quick-xml 0.13.3 (registry+https://github.com/rust-lang/crates.io-index)" = "22fcc48ecef4609b243e8c01ff4695d08ee0fc9d5bdbc54630e1a5fe8bb40953" -"checksum quote 0.6.11 (registry+https://github.com/rust-lang/crates.io-index)" = "cdd8e04bd9c52e0342b406469d494fcb033be4bdbe5c606016defbb1681411e1" +"checksum quick-xml 0.16.1 (registry+https://github.com/rust-lang/crates.io-index)" = "1cd45021132c1cb5540995e93fcc2cf5a874ef84f9639168fb6819caa023d4be" "checksum rand 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)" = "6d71dacdc3c88c1fde3885a3be3fbab9f35724e6ce99467f7d9c5026132184ca" "checksum rand_chacha 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "556d3a1ca6600bfcbab7c7c91ccb085ac7fbbcd70e008a98742e7847f4f7bcef" "checksum rand_core 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "7a6fdeb83b075e8266dcc8762c22776f6877a63111121f5f8c7411e5be7eed4b" @@ -377,11 +301,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum rand_pcg 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "abf9b09b01790cfe0364f52bf32995ea3c39f4d2dd011eac241d2914146d0b44" "checksum rand_xorshift 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "cbf7e9e623549b0e21f6e97cf8ecf247c1a8fd2e8a992ae265314300b2455d5c" "checksum rdrand 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "678054eb77286b51581ba43620cc911abf02758c91f93f479767aed0f90458b2" -"checksum rustc-demangle 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)" = "adacaae16d02b6ec37fdc7acfcddf365978de76d1983d3ee22afc260e1ca9619" "checksum siphasher 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "0b8de496cf83d4ed58b6be86c3a275b8602f6ffe98d3024a869e124147a9a3ac" -"checksum syn 0.15.29 (registry+https://github.com/rust-lang/crates.io-index)" = "1825685f977249735d510a242a6727b46efe914bb67e38d30c071b1b72b1d5c2" -"checksum synstructure 0.10.1 (registry+https://github.com/rust-lang/crates.io-index)" = "73687139bf99285483c96ac0add482c3776528beac1d97d444f6e91f203a2015" -"checksum unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc" "checksum winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "92c1eb33641e276cfa214a0522acad57be5c56b10cb348b3c5117db75f3ac4b0" "checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" "checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" diff --git a/Cargo.toml b/Cargo.toml index 8282499..2cbbf24 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,4 +2,5 @@ members = [ "dict", "dict-tei", + "dict-wiktionary", ] diff --git a/dict-tei/Cargo.toml b/dict-tei/Cargo.toml index 3cd6029..f22cae9 100644 --- a/dict-tei/Cargo.toml +++ b/dict-tei/Cargo.toml @@ -5,5 +5,7 @@ authors = ["Anders Olsson "] edition = "2018" [dependencies] -quick-xml = "0.13" -dict = { path = "../dict" } \ No newline at end of file +quick-xml = "0.16" + +[dependencies.dict] +path = "../dict" diff --git a/dict-tei/src/lib.rs b/dict-tei/src/lib.rs index 77428ab..c30b4f2 100644 --- a/dict-tei/src/lib.rs +++ b/dict-tei/src/lib.rs @@ -136,7 +136,12 @@ impl Iterator for Reader { .filter_map(Result::ok) .find(|a| a.key == b"xml:lang") .and_then(|a| { - Language::from_639_1(self.inner.decode(a.value.as_ref()).as_ref()) + Language::from_639_1( + self.inner + .decode(a.value.as_ref()) + .expect("expected language") + .as_ref(), + ) }) { self.language = Some(lang); @@ -201,7 +206,10 @@ impl Iterator for Reader { .find(|a| a.key == b"xml:lang") .and_then(|a| { Language::from_639_1( - self.inner.decode(a.value.as_ref()).as_ref(), + self.inner + .decode(a.value.as_ref()) + .expect("expected language") + .as_ref(), ) }); @@ -250,66 +258,6 @@ impl Iterator for Reader { } } -impl Reader { - pub fn parse(&mut self) { - enum State { - None, - Definition(String, Vec), - } - - self.inner.trim_text(true); - - let mut buf = Vec::new(); - - let mut state = State::None; - let mut words = Vec::new(); - - loop { - state = match (state, self.inner.read_event(&mut buf)) { - (State::None, Ok(Event::Start(ref e))) => match e.name() { - b"orth" => { - let word = self.inner.read_text(e.name(), &mut Vec::new()).unwrap(); - - State::Definition(word, Vec::new()) - } - _ => State::None, - }, - (State::Definition(word, mut translations), Ok(Event::Start(e))) => { - match e.name() { - b"orth" => { - words.push((word, translations)); - - let word = self.inner.read_text(e.name(), &mut Vec::new()).unwrap(); - - State::Definition(word, Vec::new()) - } - b"quote" => { - let translation = - self.inner.read_text(e.name(), &mut Vec::new()).unwrap(); - - translations.push(translation); - - State::Definition(word, translations) - } - _ => State::Definition(word, translations), - } - } - (_, Err(e)) => panic!( - "Error at position {}: {:?}", - self.inner.buffer_position(), - e - ), - (_, Ok(Event::Eof)) => break, - (state, _) => state, - }; - - buf.clear(); - } - - println!("{:#?}", words); - } -} - #[cfg(test)] mod tests { use super::*; @@ -340,7 +288,7 @@ mod tests { vec![] }); - reader.parse(); + // reader.parse(); assert!(true == false); } diff --git a/dict-wiktionary/Cargo.toml b/dict-wiktionary/Cargo.toml new file mode 100644 index 0000000..1844bcd --- /dev/null +++ b/dict-wiktionary/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "dict-wiktionary" +version = "0.1.0" +authors = ["Anders Olsson "] +edition = "2018" + +[dependencies] +bzip2 = "0.3" +log = "0.4" +quick-xml = "0.16" diff --git a/dict-wiktionary/src/lib.rs b/dict-wiktionary/src/lib.rs new file mode 100644 index 0000000..1e12306 --- /dev/null +++ b/dict-wiktionary/src/lib.rs @@ -0,0 +1,256 @@ +use std::io::BufRead; + +use log::*; +use quick_xml::events::Event; +use quick_xml::Reader; + +#[derive(Clone, Copy, Debug)] +enum State { + None, + MediaWiki, + Page, + Title, + Namespace, + Revision, + Text, +} + +pub struct PageReader { + reader: Reader, + state: State, +} + +impl PageReader { + pub fn from_reader(reader: B) -> PageReader { + let mut reader = Reader::from_reader(reader); + + reader + .expand_empty_elements(true) + .trim_text(false) + .check_end_names(true); + + PageReader { + reader, + state: State::None, + } + } + + fn ignore(&mut self, tag: &[u8]) { + let mut buffer = Vec::with_capacity(256); + let mut depth = 0; + + loop { + match self.reader.read_event(&mut buffer) { + Ok(Event::Start(_)) => { + depth += 1; + } + Ok(Event::End(ref e)) if e.name() == tag && depth == 0 => { + break; + } + Ok(Event::End(_)) if depth == 0 => { + // This is bad, this shouldn't be possible. Might be a bad xml file. + // Should we panic? Should we return a error? Just break for now... + break; + } + Ok(Event::End(_)) => { + depth -= 1; + } + Err(e) => { + // self.state = State::MediaWiki; + + debug!( + "ignore: error at position {}: {:?}", + self.reader.buffer_position(), + e + ); + + break; + } + _ => (), + } + + buffer.clear(); + } + } +} + +impl Iterator for PageReader { + type Item = (String, String); + + fn next(&mut self) -> Option { + let mut title = None; + let mut buffer = Vec::with_capacity(265); + + loop { + let event = self.reader.read_event(&mut buffer); + + match event { + Ok(Event::Start(_)) => trace!("state={:#?}, event=Ok(Event::Start(_))", self.state), + Ok(Event::End(_)) => trace!("state={:#?}, event=Ok(Event::End(_))", self.state), + Ok(Event::Text(_)) => trace!("state={:#?}, event=Ok(Event::Text(_))", self.state), + Ok(Event::Empty(_)) => trace!("state={:#?}, event=Ok(Event::Empty(_))", self.state), + _ => trace!("state={:#?}, event={:#?}", self.state, event), + } + + match (self.state, event) { + (State::None, Ok(Event::Start(ref e))) => match e.name() { + b"mediawiki" => self.state = State::MediaWiki, + tag => self.ignore(tag), + }, + + (State::MediaWiki, Ok(Event::Start(ref e))) => match e.name() { + b"page" => self.state = State::Page, + tag => self.ignore(tag), + }, + (State::MediaWiki, Ok(Event::End(ref e))) if e.name() == b"mediawiki" => { + self.state = State::None; + } + (State::MediaWiki, Err(e)) => { + self.state = State::MediaWiki; + + debug!( + "{:?}: error at position {}: {:?}", + self.state, + self.reader.buffer_position(), + e + ); + } + + (State::Page, Ok(Event::Start(ref e))) => match e.name() { + b"title" => self.state = State::Title, + b"ns" => self.state = State::Namespace, + b"revision" => self.state = State::Revision, + tag => self.ignore(tag), + }, + (State::Page, Ok(Event::End(ref e))) if e.name() == b"page" => { + self.state = State::MediaWiki; + } + (State::Page, Err(e)) => { + self.state = State::MediaWiki; + + debug!( + "{:?}: error at position {}: {:?}", + self.state, + self.reader.buffer_position(), + e + ); + } + + (State::Title, Ok(Event::Text(e))) => { + if let Ok(text) = e.unescape_and_decode(&self.reader) { + title = Some(text); + } + } + (State::Title, Ok(Event::End(ref e))) if e.name() == b"title" => { + self.state = State::Page; + } + (State::Title, Err(e)) => { + self.state = State::Page; + + debug!( + "{:?}: error at position {}: {:?}", + self.state, + self.reader.buffer_position(), + e + ); + } + + (State::Namespace, Ok(Event::Text(e))) => { + if let Ok(text) = e.unescape_and_decode(&self.reader) { + if text != "0" { + self.state = State::MediaWiki; + } + } + } + (State::Namespace, Ok(Event::End(ref e))) if e.name() == b"ns" => { + self.state = State::Page; + } + (State::Namespace, Err(e)) => { + self.state = State::Page; + + debug!( + "{:?}: error at position {}: {:?}", + self.state, + self.reader.buffer_position(), + e + ); + } + + (State::Revision, Ok(Event::Start(ref e))) => match e.name() { + b"text" => self.state = State::Text, + tag => self.ignore(tag), + }, + (State::Revision, Ok(Event::End(ref e))) if e.name() == b"revision" => { + self.state = State::Page; + } + (State::Revision, Err(e)) => { + self.state = State::Page; + + debug!( + "{:?}: error at position {}: {:?}", + self.state, + self.reader.buffer_position(), + e + ); + } + + (State::Text, Ok(Event::Text(e))) => { + if let Ok(text) = e.unescape_and_decode(&self.reader) { + return Some((title.unwrap_or_else(|| "".to_string()), text)); + } + } + (State::Text, Ok(Event::End(ref e))) if e.name() == b"text" => { + self.state = State::Revision; + } + (State::Text, Err(_)) => { + self.state = State::Revision; + } + + (_, Ok(Event::Eof)) => break, + + (_, Err(e)) => { + self.state = State::MediaWiki; + + debug!( + "{:?}: error at position {}: {:?}", + self.state, + self.reader.buffer_position(), + e + ); + } + + _ => (), + } + + buffer.clear(); + } + + None + } +} + +#[cfg(test)] +mod tests { + use std::fs; + use std::io::BufReader; + + use bzip2::read::BzDecoder; + + use super::*; + + #[test] + fn it_works() { + let reader = fs::File::open("/Users/olsson/Laboratory/denc-v2/corpus/wiktionary/svwiktionary-20190920-pages-articles.xml.bz2") + .map(BzDecoder::new) + .map(BufReader::new) + .expect("failed to open path"); + + let page_reader = PageReader::from_reader(reader); + + for (title, _) in page_reader { + println!("{}", title); + } + + assert!(false); + } +}