diff --git a/.gitignore b/.gitignore index 53eaa21..b38e249 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ /target -**/*.rs.bk + +*.pending-snap diff --git a/Cargo.lock b/Cargo.lock index 5db98af..e090e04 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1818,6 +1818,7 @@ dependencies = [ "fern 0.5.7 (registry+https://github.com/rust-lang/crates.io-index)", "htmlescape 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", "insta 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)", + "lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", "log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", "regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", "reqwest 0.9.9 (registry+https://github.com/rust-lang/crates.io-index)", diff --git a/Cargo.toml b/Cargo.toml index b4440e8..c2f04ae 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,6 +11,7 @@ chrono-tz = "0.5" directories = "1.0" fern = { version = "0.5", features = ["colored"] } htmlescape = "0.3" +lazy_static = "1.2" log = "0.4" regex = "1.1" reqwest = "0.9" diff --git a/src/html.rs b/src/html.rs new file mode 100644 index 0000000..a86f27b --- /dev/null +++ b/src/html.rs @@ -0,0 +1,37 @@ +use std::str; + +use scraper::{ElementRef, Html}; + +pub trait SelectExt { + fn element(&self) -> ElementRef; + + fn easy_text(&self) -> String { + let data = self + .element() + .text() + .map(str::trim) + .filter(|s| !s.is_empty()) + .collect::<Vec<_>>() + .join(" "); + + htmlescape::decode_html(&data).unwrap_or(data) + } + + fn easy_inner_html(&self) -> String { + let data = self.element().inner_html(); + + htmlescape::decode_html(&data).unwrap_or(data) + } +} + +impl SelectExt for Html { + fn element(&self) -> ElementRef { + self.root_element() + } +} + +impl<'a> SelectExt for ElementRef<'a> { + fn element(&self) -> ElementRef { + *self + } +} diff --git a/src/lib.rs b/src/lib.rs index d67aca9..969f6ce 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,6 @@ mod
context; pub mod entry; +mod html; mod probe; pub use crate::context::Context; diff --git a/src/probe/eniro.rs b/src/probe/eniro.rs index a19a546..b3a0efa 100644 --- a/src/probe/eniro.rs +++ b/src/probe/eniro.rs @@ -1,8 +1,18 @@ +use lazy_static::lazy_static; use scraper::{Html, Selector}; use crate::entry::Entry; +use crate::html::SelectExt; use crate::probe::Probe; +lazy_static! { + static ref MESSAGE: Selector = Selector::parse(".CompanyResultListItem h3.name > a").unwrap(); + static ref HISTORY_1: Selector = + Selector::parse("div.PhoneNoHit div.search-info-container p").unwrap(); + static ref HISTORY_2: Selector = + Selector::parse("div.feedback-types div.feedback-type-item").unwrap(); +} + fn from_html(document: &str) -> Result { let html = Html::parse_document(document); @@ -10,40 +20,23 @@ fn from_html(document: &str) -> Result { let mut history = Vec::new(); let comments = Vec::new(); - let selector = Selector::parse(".CompanyResultListItem h3.name > a").unwrap(); - - if let Some(element) = html.select(&selector).next() { - let message = element.inner_html(); - let message = htmlescape::decode_html(&message).unwrap(); - + if let Some(message) = html + .select(&MESSAGE) + .next() + .map(|element| element.easy_text()) + { messages.push(message); } - let selector = Selector::parse("div.PhoneNoHit div.search-info-container p").unwrap(); - - if let Some(element) = html.select(&selector).next() { - let message = element - .text() - .map(str::trim) - .filter(|s| !s.is_empty()) - .collect::<Vec<_>>() - .join(" "); - - let message = htmlescape::decode_html(&message).unwrap(); - + if let Some(message) = html + .select(&HISTORY_1) + .next() + .map(|element| element.easy_text()) + { history.push(message); } - let selector = Selector::parse("div.feedback-types div.feedback-type-item").unwrap(); - - for element in html.select(&selector) { - let message = element - .text() - .map(str::trim) - .filter(|s| !s.is_empty()) - .collect::<Vec<_>>() - .join(" "); - + for message in
html.select(&HISTORY_2).map(|element| element.easy_text()) { history.push(message); } diff --git a/src/probe/konsument_info.rs b/src/probe/konsument_info.rs index c29ce39..f382e3a 100644 --- a/src/probe/konsument_info.rs +++ b/src/probe/konsument_info.rs @@ -1,7 +1,13 @@ +use lazy_static::lazy_static; use scraper::{Html, Selector}; +use crate::html::SelectExt; use crate::probe::{Entry, Probe}; +lazy_static! { + static ref MESSAGE: Selector = Selector::parse(".panel-heading > h1:nth-child(3)").unwrap(); +} + fn from_html(document: &str) -> Result { let html = Html::parse_document(document); @@ -9,12 +15,11 @@ fn from_html(document: &str) -> Result { let history = Vec::new(); let comments = Vec::new(); - let selector = Selector::parse(".panel-heading > h1:nth-child(3)").unwrap(); - - if let Some(element) = html.select(&selector).next() { - let message = element.inner_html(); - let message = htmlescape::decode_html(&message).unwrap(); - + if let Some(message) = html + .select(&MESSAGE) + .next() + .map(|element| element.easy_text()) + { messages.push(message); } diff --git a/src/probe/telefonforsaljare.rs b/src/probe/telefonforsaljare.rs index 2fde524..399e222 100644 --- a/src/probe/telefonforsaljare.rs +++ b/src/probe/telefonforsaljare.rs @@ -1,52 +1,51 @@ use chrono_tz::Europe::Stockholm; +use lazy_static::lazy_static; use scraper::{Html, Selector}; use crate::entry::{Comment, Date, Entry}; +use crate::html::SelectExt; use crate::probe::Probe; +lazy_static! 
{ + static ref MESSAGE: Selector = Selector::parse("#content p:nth-child(2) i").unwrap(); + static ref HISTORY_1: Selector = Selector::parse("#content p:nth-child(4)").unwrap(); + static ref HISTORY_2: Selector = Selector::parse("#content p:nth-child(5)").unwrap(); + static ref COMMENTS: Selector = + Selector::parse("#kommentarer > [itemtype='http://data-vocabulary.org/Review']").unwrap(); + static ref COMMENT_DATETIME: Selector = Selector::parse("small").unwrap(); + static ref COMMENT_TITLE: Selector = Selector::parse("h3").unwrap(); + static ref COMMENT_MESSAGE: Selector = Selector::parse("[itemprop='description']").unwrap(); +} + fn from_html(document: &str) -> Result { let html = Html::parse_document(document); let mut messages = Vec::new(); + let mut history = Vec::new(); + let mut comments = Vec::new(); - let selector = Selector::parse("#content p:nth-child(2) i").unwrap(); - - if let Some(element) = html.select(&selector).next() { + if let Some(element) = html.select(&MESSAGE).next() { let message = element.inner_html(); let message = htmlescape::decode_html(&message).unwrap(); messages.push(message); } - let mut history = Vec::new(); - - let selector = if messages.is_empty() { - Selector::parse("#content p:nth-child(4)").unwrap() - } else { - Selector::parse("#content p:nth-child(5)").unwrap() - }; - - if let Some(element) = html.select(&selector).next() { - let message = element - .text() - .map(str::trim) - .filter(|s| !s.is_empty()) - .collect::<Vec<_>>() - .join(" "); - + if let Some(message) = html + .select(if messages.is_empty() { + &HISTORY_1 + } else { + &HISTORY_2 + }) + .next() + .map(|element| element.easy_text()) + { history.push(message); } - let mut comments = Vec::new(); - - let selector = - Selector::parse("#kommentarer > [itemtype='http://data-vocabulary.org/Review']").unwrap(); - - for comment in html.select(&selector) { - let selector = Selector::parse("small").unwrap(); - + for comment in html.select(&COMMENTS) { let datetime = comment -
.select(&selector) + .select(&COMMENT_DATETIME) .next() .unwrap() .value() @@ -54,20 +53,22 @@ fn from_html(document: &str) -> Result { .unwrap() .to_string(); - let selector = Selector::parse("h3").unwrap(); + let title = comment + .select(&COMMENT_TITLE) + .next() + .map(|element| element.easy_inner_html()) + .filter(|title| !title.is_empty()); - let title = comment.select(&selector).next().unwrap().inner_html(); - let title = htmlescape::decode_html(&title).unwrap(); - - let selector = Selector::parse("[itemprop='description']").unwrap(); - - let message = comment.select(&selector).next().unwrap().inner_html(); - let message = htmlescape::decode_html(&message).unwrap(); + let message = comment + .select(&COMMENT_MESSAGE) + .next() + .map(|element| element.easy_inner_html()) + .unwrap_or_else(String::new); comments.push(Comment { datetime: Date::datetime_from(Stockholm, &datetime, "%Y-%m-%d %H:%M:%S") .expect("failed to parse datetime"), - title: if title.is_empty() { None } else { Some(title) }, + title, message, }); } diff --git a/src/probe/vem_ringde.rs b/src/probe/vem_ringde.rs index ee919d4..2a75ee8 100644 --- a/src/probe/vem_ringde.rs +++ b/src/probe/vem_ringde.rs @@ -1,11 +1,20 @@ use std::str; use chrono_tz::Europe::Stockholm; +use lazy_static::lazy_static; use scraper::{Html, Selector}; use crate::entry::{Comment, Date, Entry}; +use crate::html::SelectExt; use crate::probe::Probe; +lazy_static! 
{ + static ref MESSAGE: Selector = Selector::parse("#toporganisations li").unwrap(); + static ref COMMENTS: Selector = Selector::parse("#calls ol li").unwrap(); + static ref COMMENT_DATETIME: Selector = Selector::parse("div:nth-child(4)").unwrap(); + static ref COMMENT_MESSAGE: Selector = Selector::parse("div:nth-child(3)").unwrap(); +} + fn from_html(document: &str) -> Result { let html = Html::parse_document(document); @@ -13,43 +22,22 @@ fn from_html(document: &str) -> Result { let history = Vec::new(); let mut comments = Vec::new(); - let selector = Selector::parse("#toporganisations li").unwrap(); - - for element in html.select(&selector) { - let message = element - .text() - .map(str::trim) - .filter(|s| !s.is_empty()) - .collect::<Vec<_>>() - .join(" "); - + for message in html.select(&MESSAGE).map(|element| element.easy_text()) { messages.push(message); } - let selector = Selector::parse("#calls ol li").expect("failed to build selector"); - - for element in html.select(&selector) { - let selector = Selector::parse("div:nth-child(4)").expect("failed to build selector"); - + for element in html.select(&COMMENTS) { let date = element - .select(&selector) + .select(&COMMENT_DATETIME) .next() - .expect("failed to find datetime") - .inner_html(); - - let selector = Selector::parse("div:nth-child(3)").expect("failed to build selector"); + .map(|element| element.easy_inner_html()) + .expect("failed to find datetime"); let message = element - .select(&selector) + .select(&COMMENT_MESSAGE) .next() - .unwrap() - .text() - .map(str::trim) - .filter(|s| !s.is_empty()) - .collect::<Vec<_>>() - .join(" "); - - let message = htmlescape::decode_html(&message).unwrap(); + .map(|element| element.easy_text()) + .unwrap_or_else(String::new); comments.push(Comment { datetime: Date::date_from(Stockholm, &date, "%Y-%m-%d").expect("failed to parse date"),