This commit is contained in:
2019-02-07 16:18:59 +01:00
parent 3edc4e411a
commit 70f89cc35c
9 changed files with 129 additions and 101 deletions

3
.gitignore vendored
View File

@@ -1,2 +1,3 @@
/target /target
**/*.rs.bk
*.pending-snap

1
Cargo.lock generated
View File

@@ -1818,6 +1818,7 @@ dependencies = [
"fern 0.5.7 (registry+https://github.com/rust-lang/crates.io-index)", "fern 0.5.7 (registry+https://github.com/rust-lang/crates.io-index)",
"htmlescape 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", "htmlescape 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
"insta 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)", "insta 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)",
"lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
"log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", "log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)",
"regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", "regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
"reqwest 0.9.9 (registry+https://github.com/rust-lang/crates.io-index)", "reqwest 0.9.9 (registry+https://github.com/rust-lang/crates.io-index)",

View File

@@ -11,6 +11,7 @@ chrono-tz = "0.5"
directories = "1.0" directories = "1.0"
fern = { version = "0.5", features = ["colored"] } fern = { version = "0.5", features = ["colored"] }
htmlescape = "0.3" htmlescape = "0.3"
lazy_static = "1.2"
log = "0.4" log = "0.4"
regex = "1.1" regex = "1.1"
reqwest = "0.9" reqwest = "0.9"

37
src/html.rs Normal file
View File

@@ -0,0 +1,37 @@
use std::str;
use scraper::{ElementRef, Html};
/// Convenience helpers for pulling decoded text or markup out of a scraped element.
///
/// Implementors only supply [`SelectExt::element`]; the `easy_*` methods are
/// provided on top of it.
pub trait SelectExt {
    /// Returns the element whose content the `easy_*` helpers read.
    fn element(&self) -> ElementRef;

    /// Returns the element's text as a single line.
    ///
    /// Every text fragment yielded by the element is trimmed, empty fragments
    /// are dropped, and the remainder is joined with single spaces. The joined
    /// string is then passed through `htmlescape::decode_html`; if decoding
    /// fails, the undecoded string is returned instead.
    fn easy_text(&self) -> String {
        let mut joined = String::new();
        for fragment in self.element().text() {
            let fragment = fragment.trim();
            if fragment.is_empty() {
                continue;
            }
            // Fragments are never empty here, so a non-empty buffer means at
            // least one fragment precedes this one and needs a separator.
            if !joined.is_empty() {
                joined.push(' ');
            }
            joined.push_str(fragment);
        }

        let decoded = htmlescape::decode_html(&joined);
        match decoded {
            Ok(text) => text,
            Err(_) => joined,
        }
    }

    /// Returns the element's inner HTML with HTML entities decoded.
    ///
    /// If `htmlescape::decode_html` fails, the inner HTML is returned as
    /// serialized, without decoding.
    fn easy_inner_html(&self) -> String {
        let markup = self.element().inner_html();

        let decoded = htmlescape::decode_html(&markup);
        match decoded {
            Ok(text) => text,
            Err(_) => markup,
        }
    }
}
impl SelectExt for Html {
fn element(&self) -> ElementRef {
self.root_element()
}
}
/// An element reference is read as itself.
impl SelectExt for ElementRef<'_> {
    fn element(&self) -> ElementRef<'_> {
        // Dereferencing yields a copy of the reference handle.
        *self
    }
}

View File

@@ -1,5 +1,6 @@
mod context; mod context;
pub mod entry; pub mod entry;
mod html;
mod probe; mod probe;
pub use crate::context::Context; pub use crate::context::Context;

View File

@@ -1,8 +1,18 @@
use lazy_static::lazy_static;
use scraper::{Html, Selector}; use scraper::{Html, Selector};
use crate::entry::Entry; use crate::entry::Entry;
use crate::html::SelectExt;
use crate::probe::Probe; use crate::probe::Probe;
// CSS selectors used by `from_html`, built through `lazy_static!` so each one
// is parsed on first access rather than per call.
// `unwrap()` panics if a selector string fails to parse.
lazy_static! {
// The first match's text is pushed onto `messages`.
static ref MESSAGE: Selector = Selector::parse(".CompanyResultListItem h3.name > a").unwrap();
// The first match's text is pushed onto `history`.
static ref HISTORY_1: Selector =
Selector::parse("div.PhoneNoHit div.search-info-container p").unwrap();
// The text of every match is pushed onto `history`.
static ref HISTORY_2: Selector =
Selector::parse("div.feedback-types div.feedback-type-item").unwrap();
}
fn from_html(document: &str) -> Result<Entry, ()> { fn from_html(document: &str) -> Result<Entry, ()> {
let html = Html::parse_document(document); let html = Html::parse_document(document);
@@ -10,40 +20,23 @@ fn from_html(document: &str) -> Result<Entry, ()> {
let mut history = Vec::new(); let mut history = Vec::new();
let comments = Vec::new(); let comments = Vec::new();
let selector = Selector::parse(".CompanyResultListItem h3.name > a").unwrap(); if let Some(message) = html
.select(&MESSAGE)
if let Some(element) = html.select(&selector).next() { .next()
let message = element.inner_html(); .map(|element| element.easy_text())
let message = htmlescape::decode_html(&message).unwrap(); {
messages.push(message); messages.push(message);
} }
let selector = Selector::parse("div.PhoneNoHit div.search-info-container p").unwrap(); if let Some(message) = html
.select(&HISTORY_1)
if let Some(element) = html.select(&selector).next() { .next()
let message = element .map(|element| element.easy_text())
.text() {
.map(str::trim)
.filter(|s| !s.is_empty())
.collect::<Vec<_>>()
.join(" ");
let message = htmlescape::decode_html(&message).unwrap();
history.push(message); history.push(message);
} }
let selector = Selector::parse("div.feedback-types div.feedback-type-item").unwrap(); for message in html.select(&HISTORY_2).map(|element| element.easy_text()) {
for element in html.select(&selector) {
let message = element
.text()
.map(str::trim)
.filter(|s| !s.is_empty())
.collect::<Vec<_>>()
.join(" ");
history.push(message); history.push(message);
} }

View File

@@ -1,7 +1,13 @@
use lazy_static::lazy_static;
use scraper::{Html, Selector}; use scraper::{Html, Selector};
use crate::html::SelectExt;
use crate::probe::{Entry, Probe}; use crate::probe::{Entry, Probe};
// CSS selector used by `from_html`, built through `lazy_static!` so it is
// parsed on first access rather than per call.
// `unwrap()` panics if the selector string fails to parse.
lazy_static! {
// The first match's text is pushed onto `messages`.
static ref MESSAGE: Selector = Selector::parse(".panel-heading > h1:nth-child(3)").unwrap();
}
fn from_html(document: &str) -> Result<Entry, ()> { fn from_html(document: &str) -> Result<Entry, ()> {
let html = Html::parse_document(document); let html = Html::parse_document(document);
@@ -9,12 +15,11 @@ fn from_html(document: &str) -> Result<Entry, ()> {
let history = Vec::new(); let history = Vec::new();
let comments = Vec::new(); let comments = Vec::new();
let selector = Selector::parse(".panel-heading > h1:nth-child(3)").unwrap(); if let Some(message) = html
.select(&MESSAGE)
if let Some(element) = html.select(&selector).next() { .next()
let message = element.inner_html(); .map(|element| element.easy_text())
let message = htmlescape::decode_html(&message).unwrap(); {
messages.push(message); messages.push(message);
} }

View File

@@ -1,52 +1,51 @@
use chrono_tz::Europe::Stockholm; use chrono_tz::Europe::Stockholm;
use lazy_static::lazy_static;
use scraper::{Html, Selector}; use scraper::{Html, Selector};
use crate::entry::{Comment, Date, Entry}; use crate::entry::{Comment, Date, Entry};
use crate::html::SelectExt;
use crate::probe::Probe; use crate::probe::Probe;
// CSS selectors used by `from_html`, built through `lazy_static!` so each one
// is parsed on first access rather than per call.
// `unwrap()` panics if a selector string fails to parse.
lazy_static! {
// The first match's decoded inner HTML is pushed onto `messages`.
static ref MESSAGE: Selector = Selector::parse("#content p:nth-child(2) i").unwrap();
// History paragraph used when no message was found.
static ref HISTORY_1: Selector = Selector::parse("#content p:nth-child(4)").unwrap();
// History paragraph used when a message was found.
static ref HISTORY_2: Selector = Selector::parse("#content p:nth-child(5)").unwrap();
// One match per comment; the three selectors below are applied inside each match.
static ref COMMENTS: Selector =
Selector::parse("#kommentarer > [itemtype='http://data-vocabulary.org/Review']").unwrap();
// Element the comment's datetime string is read from.
static ref COMMENT_DATETIME: Selector = Selector::parse("small").unwrap();
// Optional comment title; an empty title becomes `None`.
static ref COMMENT_TITLE: Selector = Selector::parse("h3").unwrap();
// Comment body; a missing element yields an empty string.
static ref COMMENT_MESSAGE: Selector = Selector::parse("[itemprop='description']").unwrap();
}
fn from_html(document: &str) -> Result<Entry, ()> { fn from_html(document: &str) -> Result<Entry, ()> {
let html = Html::parse_document(document); let html = Html::parse_document(document);
let mut messages = Vec::new(); let mut messages = Vec::new();
let mut history = Vec::new();
let mut comments = Vec::new();
let selector = Selector::parse("#content p:nth-child(2) i").unwrap(); if let Some(element) = html.select(&MESSAGE).next() {
if let Some(element) = html.select(&selector).next() {
let message = element.inner_html(); let message = element.inner_html();
let message = htmlescape::decode_html(&message).unwrap(); let message = htmlescape::decode_html(&message).unwrap();
messages.push(message); messages.push(message);
} }
let mut history = Vec::new(); if let Some(message) = html
.select(if messages.is_empty() {
let selector = if messages.is_empty() { &HISTORY_1
Selector::parse("#content p:nth-child(4)").unwrap() } else {
} else { &HISTORY_2
Selector::parse("#content p:nth-child(5)").unwrap() })
}; .next()
.map(|element| element.easy_text())
if let Some(element) = html.select(&selector).next() { {
let message = element
.text()
.map(str::trim)
.filter(|s| !s.is_empty())
.collect::<Vec<_>>()
.join(" ");
history.push(message); history.push(message);
} }
let mut comments = Vec::new(); for comment in html.select(&COMMENTS) {
let selector =
Selector::parse("#kommentarer > [itemtype='http://data-vocabulary.org/Review']").unwrap();
for comment in html.select(&selector) {
let selector = Selector::parse("small").unwrap();
let datetime = comment let datetime = comment
.select(&selector) .select(&COMMENT_DATETIME)
.next() .next()
.unwrap() .unwrap()
.value() .value()
@@ -54,20 +53,22 @@ fn from_html(document: &str) -> Result<Entry, ()> {
.unwrap() .unwrap()
.to_string(); .to_string();
let selector = Selector::parse("h3").unwrap(); let title = comment
.select(&COMMENT_TITLE)
.next()
.map(|element| element.easy_inner_html())
.filter(|title| !title.is_empty());
let title = comment.select(&selector).next().unwrap().inner_html(); let message = comment
let title = htmlescape::decode_html(&title).unwrap(); .select(&COMMENT_MESSAGE)
.next()
let selector = Selector::parse("[itemprop='description']").unwrap(); .map(|element| element.easy_inner_html())
.unwrap_or_else(String::new);
let message = comment.select(&selector).next().unwrap().inner_html();
let message = htmlescape::decode_html(&message).unwrap();
comments.push(Comment { comments.push(Comment {
datetime: Date::datetime_from(Stockholm, &datetime, "%Y-%m-%d %H:%M:%S") datetime: Date::datetime_from(Stockholm, &datetime, "%Y-%m-%d %H:%M:%S")
.expect("failed to parse datetime"), .expect("failed to parse datetime"),
title: if title.is_empty() { None } else { Some(title) }, title,
message, message,
}); });
} }

View File

@@ -1,11 +1,20 @@
use std::str; use std::str;
use chrono_tz::Europe::Stockholm; use chrono_tz::Europe::Stockholm;
use lazy_static::lazy_static;
use scraper::{Html, Selector}; use scraper::{Html, Selector};
use crate::entry::{Comment, Date, Entry}; use crate::entry::{Comment, Date, Entry};
use crate::html::SelectExt;
use crate::probe::Probe; use crate::probe::Probe;
// CSS selectors used by `from_html`, built through `lazy_static!` so each one
// is parsed on first access rather than per call.
// `unwrap()` panics if a selector string fails to parse.
lazy_static! {
// The text of every match is pushed onto `messages`.
static ref MESSAGE: Selector = Selector::parse("#toporganisations li").unwrap();
// One match per comment; the two selectors below are applied inside each match.
static ref COMMENTS: Selector = Selector::parse("#calls ol li").unwrap();
// Element holding the comment date, parsed with the "%Y-%m-%d" format.
static ref COMMENT_DATETIME: Selector = Selector::parse("div:nth-child(4)").unwrap();
// Comment body; a missing element yields an empty string.
static ref COMMENT_MESSAGE: Selector = Selector::parse("div:nth-child(3)").unwrap();
}
fn from_html(document: &str) -> Result<Entry, ()> { fn from_html(document: &str) -> Result<Entry, ()> {
let html = Html::parse_document(document); let html = Html::parse_document(document);
@@ -13,43 +22,22 @@ fn from_html(document: &str) -> Result<Entry, ()> {
let history = Vec::new(); let history = Vec::new();
let mut comments = Vec::new(); let mut comments = Vec::new();
let selector = Selector::parse("#toporganisations li").unwrap(); for message in html.select(&MESSAGE).map(|element| element.easy_text()) {
for element in html.select(&selector) {
let message = element
.text()
.map(str::trim)
.filter(|s| !s.is_empty())
.collect::<Vec<_>>()
.join(" ");
messages.push(message); messages.push(message);
} }
let selector = Selector::parse("#calls ol li").expect("failed to build selector"); for element in html.select(&COMMENTS) {
for element in html.select(&selector) {
let selector = Selector::parse("div:nth-child(4)").expect("failed to build selector");
let date = element let date = element
.select(&selector) .select(&COMMENT_DATETIME)
.next() .next()
.expect("failed to find datetime") .map(|element| element.easy_inner_html())
.inner_html(); .expect("failed to find datetime");
let selector = Selector::parse("div:nth-child(3)").expect("failed to build selector");
let message = element let message = element
.select(&selector) .select(&COMMENT_MESSAGE)
.next() .next()
.unwrap() .map(|element| element.easy_text())
.text() .unwrap_or_else(String::new);
.map(str::trim)
.filter(|s| !s.is_empty())
.collect::<Vec<_>>()
.join(" ");
let message = htmlescape::decode_html(&message).unwrap();
comments.push(Comment { comments.push(Comment {
datetime: Date::date_from(Stockholm, &date, "%Y-%m-%d").expect("failed to parse date"), datetime: Date::date_from(Stockholm, &date, "%Y-%m-%d").expect("failed to parse date"),