diff --git a/Cargo.lock b/Cargo.lock index e090e04..fd8f696 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -87,9 +87,10 @@ dependencies = [ [[package]] name = "bincode" -version = "1.0.1" +version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ + "autocfg 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", "byteorder 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)", "serde 1.0.87 (registry+https://github.com/rust-lang/crates.io-index)", ] @@ -1570,6 +1571,15 @@ dependencies = [ "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "tinytemplate" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "serde 1.0.87 (registry+https://github.com/rust-lang/crates.io-index)", + "serde_json 1.0.38 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "tokio" version = "0.1.15" @@ -1673,6 +1683,14 @@ dependencies = [ "tokio-executor 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "toml" +version = "0.4.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "serde 1.0.87 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "try-lock" version = "0.2.2" @@ -1811,7 +1829,7 @@ dependencies = [ name = "whoareyou" version = "0.1.0" dependencies = [ - "bincode 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)", + "bincode 1.1.1 (registry+https://github.com/rust-lang/crates.io-index)", "chrono 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", "chrono-tz 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", "directories 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", @@ -1826,6 +1844,8 @@ dependencies = [ "serde 1.0.87 (registry+https://github.com/rust-lang/crates.io-index)", "serde_json 1.0.38 (registry+https://github.com/rust-lang/crates.io-index)", "structopt 0.2.14 (registry+https://github.com/rust-lang/crates.io-index)", + "tinytemplate 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)", + "toml 0.4.10 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -1886,7 +1906,7 @@ dependencies = [ "checksum backtrace 0.3.13 (registry+https://github.com/rust-lang/crates.io-index)" = "b5b493b66e03090ebc4343eb02f94ff944e0cbc9ac6571491d170ba026741eb5" "checksum backtrace-sys 0.1.28 (registry+https://github.com/rust-lang/crates.io-index)" = "797c830ac25ccc92a7f8a7b9862bde440715531514594a6154e3d4a54dd769b6" "checksum base64 0.10.1 (registry+https://github.com/rust-lang/crates.io-index)" = "0b25d992356d2eb0ed82172f5248873db5560c4721f564b13cb5193bda5e668e" -"checksum bincode 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "9f2fb9e29e72fd6bc12071533d5dc7664cb01480c59406f656d7ac25c7bd8ff7" +"checksum bincode 1.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "58470ad6460f0b0e89b0df5f17b8bd77ebae26af69dca0bd9ddc8b9e38abb2ff" "checksum bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)" = "228047a76f468627ca71776ecdebd732a3423081fcf5125585bcd7c49886ce12" "checksum block-buffer 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "a076c298b9ecdb530ed9d967e74a6027d6a7478924520acddcddc24c1c8ab3ab" "checksum byte-tools 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "560c32574a12a89ecd91f5e742165893f86e3ab98d21f8ea548658eb9eef5f40" @@ -2053,6 +2073,7 @@ dependencies = [ "checksum thin-slice 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "8eaa81235c7058867fa8c0e7314f33dcce9c215f535d1913822a2b3f5e289f3c" "checksum thread_local 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c6b53e329000edc2b34dbe8545fd20e55a333362d0a321909685a19bd28c3f1b" "checksum time 0.1.42 (registry+https://github.com/rust-lang/crates.io-index)" = "db8dcfca086c1143c9270ac42a2bbd8a7ee477b78ac8e45b19abfb0cbede4b6f" +"checksum tinytemplate 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "7655088894274afb52b807bd3c87072daa1fedd155068b8705cabfd628956115" "checksum tokio 0.1.15 (registry+https://github.com/rust-lang/crates.io-index)" = "e0500b88064f08bebddd0c0bed39e19f5c567a5f30975bee52b0c0d3e2eeb38c" "checksum tokio-current-thread 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "331c8acc267855ec06eb0c94618dcbbfea45bed2d20b77252940095273fb58f6" "checksum tokio-executor 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "30c6dbf2d1ad1de300b393910e8a3aa272b724a400b6531da03eed99e329fbf0" @@ -2061,6 +2082,7 @@ dependencies = [ "checksum tokio-tcp 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "1d14b10654be682ac43efee27401d792507e30fd8d26389e1da3b185de2e4119" "checksum tokio-threadpool 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)" = "c3fd86cb15547d02daa2b21aadaf4e37dee3368df38a526178a5afa3c034d2fb" "checksum tokio-timer 0.2.10 (registry+https://github.com/rust-lang/crates.io-index)" = "2910970404ba6fa78c5539126a9ae2045d62e3713041e447f695f41405a120c6" +"checksum toml 0.4.10 (registry+https://github.com/rust-lang/crates.io-index)" = "758664fc71a3a69038656bee8b6be6477d2a6c315a6b81f7081f591bffa4111f" "checksum try-lock 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "e604eb7b43c06650e854be16a2a03155743d3752dd1c943f6829e26b7a36e382" "checksum typenum 1.10.0 (registry+https://github.com/rust-lang/crates.io-index)" = "612d636f949607bdf9b123b4a6f6d966dedf3ff669f7f045890d3a4a73948169" "checksum ucd-trie 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "71a9c5b1fe77426cf144cc30e49e955270f5086e31a6441dfa8b32efc09b9d77" diff --git a/Cargo.toml b/Cargo.toml index c2f04ae..572b2f1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,7 +5,7 @@ authors = ["Anders Olsson "] edition = "2018" [dependencies] -bincode = "1.0" +bincode = "1.1" chrono = { version = "0.4", features = ["serde"] } chrono-tz = "0.5" directories = "1.0" @@ -19,6 +19,8 @@ scraper = "0.9" serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" structopt = "0.2" +tinytemplate = "1.0" +toml = "0.4" [dev-dependencies] insta = "0.6" diff --git a/definitions/eniro.toml b/definitions/eniro.toml new file mode 100644 index 0000000..bb1ecc4 --- /dev/null +++ b/definitions/eniro.toml @@ -0,0 +1,11 @@ +name = "eniro.se" +path = "https://gulasidorna.eniro.se/hitta:{ number }" + +[[messages]] +selector = ".CompanyResultListItem h3.name > a" + +[[history]] +selector = "div.PhoneNoHit div.search-info-container p" + +[[history]] +selector = "div.feedback-types div.feedback-type-item" diff --git a/definitions/konsument_info.toml b/definitions/konsument_info.toml new file mode 100644 index 0000000..2c54f74 --- /dev/null +++ b/definitions/konsument_info.toml @@ -0,0 +1,5 @@ +name = "konsumentinfo.se" +path = "http://konsumentinfo.se/telefonnummer/sverige/{ number }" + +[[messages]] +selector = ".panel-heading > h1:nth-child(3)" diff --git a/definitions/telefonforsaljare.toml b/definitions/telefonforsaljare.toml new file mode 100644 index 0000000..f859a70 --- /dev/null +++ b/definitions/telefonforsaljare.toml @@ -0,0 +1,29 @@ +name = "telefonforsaljare.nu" +path = "http://www.telefonforsaljare.nu/telefonnummer/{ number }/" + +[[messages]] +selector = "#content p:nth-child(2) i" + +[[history]] +selector = "#content p:nth-child(4)" + +[[history]] +selector = "#content p:nth-child(5)" + +[[comments]] +selector = "#kommentarer > [itemtype='http://data-vocabulary.org/Review']" + +[comments.date_time] +selector = "small" +data = "attr:datetime" +kind = "date_time" +format = "%Y-%m-%d %H:%M:%S" +tz = "Europe/Stockholm" + +[comments.title] +selector = "h3" +data = "inner_html" + +[comments.message] +selector = "[itemprop='description']" +data = "inner_html" diff --git a/definitions/vem_ringde.toml b/definitions/vem_ringde.toml new file mode 100644 index 0000000..c2fc328 --- /dev/null +++ b/definitions/vem_ringde.toml @@ -0,0 +1,18 @@ +name = "vemringde.se" +path = "http://vemringde.se/?q={ number }" + +[[messages]] +selector = "#toporganisations li" + +[[comments]] +selector = "#calls ol li" + +[comments.date_time] +selector = "div:nth-child(4)" +data = "inner_html" +kind = "date" +format = "%Y-%m-%d" +tz = "Europe/Stockholm" + +[comments.message] +selector = "div:nth-child(3)" diff --git a/src/definition.rs b/src/definition.rs new file mode 100644 index 0000000..760b59d --- /dev/null +++ b/src/definition.rs @@ -0,0 +1,283 @@ +use std::str; + +use chrono_tz::Tz; +use scraper::{ElementRef, Html, Selector}; +use serde::{de, Deserialize, Deserializer, Serialize}; +use tinytemplate::TinyTemplate; + +use crate::entry::{self, Date, Entry}; +use crate::probe::Probe; + +#[derive(Serialize)] +struct Context { + number: String, +} + +#[derive(Debug, Deserialize)] +pub struct Definition { + name: String, + path: String, + messages: Vec, + #[serde(default)] + history: Vec, + #[serde(default)] + comments: Vec, +} + +#[derive(Debug, Deserialize)] +struct Comment { + #[serde(deserialize_with = "deserialize_selector")] + selector: Selector, + #[serde(rename = "date_time")] + datetime: Option, + title: Option, + message: Option, +} + +#[derive(Debug, Deserialize)] +#[serde(rename_all = "snake_case")] +struct DateTime { + #[serde(flatten)] + field: Field, + kind: DateTimeKind, + format: String, + #[serde(deserialize_with = "deserialize_tz")] + tz: Tz, +} + +#[derive(Debug, Deserialize)] +#[serde(rename_all = "snake_case")] +enum DateTimeKind { + Date, + DateTime, +} + +#[derive(Debug, Deserialize)] +#[serde(tag = "type", rename_all = "snake_case")] +enum Filter {} + +#[derive(Debug, Deserialize)] +struct Field { + #[serde(deserialize_with = "deserialize_selector")] + selector: Selector, + #[serde(default)] + data: Data, + #[serde(default)] + filters: Vec, +} + +#[derive(Debug)] +enum Data { + Text, + InnerHtml, + Attr { attr: String }, +} + +impl Data { + fn extract(&self, element: &ElementRef) -> Option { + match self { + Data::Text => Some( + element + .text() + .map(str::trim) + .filter(|s| !s.is_empty()) + .collect::>() + .join(" "), + ), + Data::InnerHtml => Some(element.inner_html()), + Data::Attr { attr } => element.value().attr(attr).map(|data| data.to_string()), + } + } +} + +impl Default for Data { + fn default() -> Self { + Data::Text + } +} + +impl<'de> Deserialize<'de> for Data { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + use std::fmt; + + use serde::de::{self, Visitor}; + + struct StrVisitor; + + impl<'de> Visitor<'de> for StrVisitor { + type Value = Data; + + fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + formatter.write_str("an str") + } + + fn visit_str(self, value: &str) -> Result + where + E: de::Error, + { + match value { + "text" => Ok(Data::Text), + "inner_html" => Ok(Data::InnerHtml), + s if s.starts_with("attr:") => { + let attr = s.splitn(2, ":").nth(1).unwrap(); + + Ok(Data::Attr { + attr: attr.to_string(), + }) + } + _ => Err(E::custom(format!("unknown data type: {}", value))), + } + } + } + + deserializer.deserialize_str(StrVisitor) + } +} + +impl Probe for Definition { + fn provider(&self) -> &str { + &self.name + } + + fn uri(&self, number: &str) -> String { + let mut tt = TinyTemplate::new(); + + tt.add_template("path", &self.path) + .expect("failed to add path template"); + + let context = Context { + number: number.to_string(), + }; + + tt.render("path", &context) + .expect("failed to render path template") + } + + fn fetch(&self, number: &str) -> Result { + reqwest::get(&self.uri(number)) + .map_err(|_| ())? + .text() + .map_err(|_| ()) + } + + fn parse(&self, data: &str) -> Result { + let html = Html::parse_document(data); + + let mut messages = Vec::new(); + let mut history = Vec::new(); + let mut comments = Vec::new(); + + for field in &self.messages { + for element in html.select(&field.selector) { + if let Some(data) = field.data.extract(&element) { + messages.push(data); + } + } + } + + for field in &self.history { + for element in html.select(&field.selector) { + if let Some(data) = field.data.extract(&element) { + history.push(data); + } + } + } + + for comment in &self.comments { + for comments_element in html.select(&comment.selector) { + let mut datetime: Option = None; + let mut title: Option = None; + let mut message: Option = None; + + if let Some(ref datetime_field) = comment.datetime { + for comment_element in comments_element.select(&datetime_field.field.selector) { + if let Some(data) = datetime_field.field.data.extract(&comment_element) { + // for filter in &datetime_field.field.filters {} + + let data = match datetime_field.kind { + DateTimeKind::Date => Date::date_from( + datetime_field.tz, + &data, + &datetime_field.format, + ) + .expect("failed to parse date"), + DateTimeKind::DateTime => Date::datetime_from( + datetime_field.tz, + &data, + &datetime_field.format, + ) + .expect("failed to parse date time"), + }; + + datetime = Some(data); + } + } + } + + if let Some(ref title_field) = comment.title { + for comment_element in comments_element.select(&title_field.selector) { + if let Some(data) = title_field + .data + .extract(&comment_element) + .filter(|data| !data.is_empty()) + { + // for filter in &message_field.filters {} + + title = Some(data); + } + } + } + + if let Some(ref message_field) = comment.message { + for comment_element in comments_element.select(&message_field.selector) { + if let Some(data) = message_field.data.extract(&comment_element) { + // for filter in &message_field.filters {} + + message = Some(data); + } + } + } + + if datetime.is_some() && message.is_some() { + comments.push(entry::Comment { + datetime: datetime.unwrap(), + title, + message: message.unwrap(), + }); + } + } + } + + if !messages.is_empty() || !history.is_empty() || !comments.is_empty() { + Ok(Entry { + messages, + history, + comments, + }) + } else { + Err(()) + } + } +} + +fn deserialize_selector<'de, D>(deserializer: D) -> Result +where + D: Deserializer<'de>, +{ + let s = String::deserialize(deserializer)?; + + Selector::parse(&s).map_err(|_| de::Error::custom("failed to parse selector")) +} + +fn deserialize_tz<'de, D>(deserializer: D) -> Result +where + D: Deserializer<'de>, +{ + let s = String::deserialize(deserializer)?; + + s.parse::() + .map_err(|_| de::Error::custom("failed to parse tz")) +} diff --git a/src/lib.rs b/src/lib.rs index 969f6ce..b6a94b1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,7 +1,9 @@ mod context; +pub mod definition; pub mod entry; mod html; mod probe; pub use crate::context::Context; +pub use crate::definition::Definition; pub use crate::probe::*; diff --git a/src/main.rs b/src/main.rs index fe43451..912970a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,3 +1,6 @@ +use std::fs; +use std::io::Read; +use std::path::PathBuf; use std::process::Command; use fern::colors::{Color, ColoredLevelConfig}; @@ -13,6 +16,9 @@ struct Opt { #[structopt(short = "o", long = "open")] open: bool, + #[structopt(short = "d", long = "definitions", parse(from_os_str))] + definitions: Vec, + number: String, } @@ -52,13 +58,27 @@ fn main() { config.apply().expect("failed to init fern"); - let mut probes: Vec> = vec![ - Box::new(Eniro), - Box::new(Hitta), - Box::new(KonsumentInfo), - Box::new(Telefonforsaljare), - Box::new(VemRingde), - ]; + let mut probes: Vec> = vec![Box::new(Hitta)]; + + let mut buffer = Vec::new(); + + for definition in &opt.definitions { + let definition = fs::File::open(&definition) + .and_then(|mut file| { + file.read_to_end(&mut buffer) + .expect("failed to read definition file"); + + let definition: Definition = + toml::from_slice(&buffer).expect("failed to parse definition file"); + + buffer.clear(); + + Ok(definition) + }) + .expect("failed to open definition file"); + + probes.push(Box::new(definition)); + } if opt.open { for probe in &mut probes { diff --git a/src/probe.rs b/src/probe.rs index 10a58c5..55828f2 100644 --- a/src/probe.rs +++ b/src/probe.rs @@ -1,19 +1,11 @@ use crate::entry::Entry; -mod eniro; mod hitta; -mod konsument_info; -mod telefonforsaljare; -mod vem_ringde; -pub use self::eniro::Eniro; pub use self::hitta::Hitta; -pub use self::konsument_info::KonsumentInfo; -pub use self::telefonforsaljare::Telefonforsaljare; -pub use self::vem_ringde::VemRingde; pub trait Probe { - fn provider(&self) -> &'static str; + fn provider(&self) -> &str; fn uri(&self, _: &str) -> String; fn fetch(&self, _: &str) -> Result;