This commit is contained in:
2019-02-07 16:18:59 +01:00
parent 3edc4e411a
commit 70f89cc35c
9 changed files with 129 additions and 101 deletions

3
.gitignore vendored
View File

@@ -1,2 +1,3 @@
/target
**/*.rs.bk
*.pending-snap

1
Cargo.lock generated
View File

@@ -1818,6 +1818,7 @@ dependencies = [
"fern 0.5.7 (registry+https://github.com/rust-lang/crates.io-index)",
"htmlescape 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
"insta 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)",
"lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
"log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)",
"regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
"reqwest 0.9.9 (registry+https://github.com/rust-lang/crates.io-index)",

View File

@@ -11,6 +11,7 @@ chrono-tz = "0.5"
directories = "1.0"
fern = { version = "0.5", features = ["colored"] }
htmlescape = "0.3"
lazy_static = "1.2"
log = "0.4"
regex = "1.1"
reqwest = "0.9"

37
src/html.rs Normal file
View File

@@ -0,0 +1,37 @@
use std::str;
use scraper::{ElementRef, Html};
/// Convenience helpers for pulling entity-decoded strings out of parsed HTML.
///
/// Implementors only supply [`SelectExt::element`]; the `easy_*` methods are
/// provided on top of it.
pub trait SelectExt {
    /// Returns the element whose contents the `easy_*` helpers read.
    fn element(&self) -> ElementRef;

    /// Returns the element's text as a single space-separated string.
    ///
    /// Each text node is trimmed, nodes that are empty after trimming are
    /// dropped, and the remaining pieces are joined with one space. The joined
    /// string is then passed through `htmlescape::decode_html`; if decoding
    /// fails, the undecoded string is returned instead.
    fn easy_text(&self) -> String {
        let node = self.element();

        let mut pieces: Vec<&str> = Vec::new();
        for chunk in node.text() {
            let trimmed = chunk.trim();
            if !trimmed.is_empty() {
                pieces.push(trimmed);
            }
        }

        let joined = pieces.join(" ");
        match htmlescape::decode_html(&joined) {
            Ok(decoded) => decoded,
            // Decoding failed: fall back to the raw joined text.
            Err(_) => joined,
        }
    }

    /// Returns the element's inner HTML passed through
    /// `htmlescape::decode_html`.
    ///
    /// If decoding fails, the undecoded inner HTML is returned instead.
    fn easy_inner_html(&self) -> String {
        let markup = self.element().inner_html();
        match htmlescape::decode_html(&markup) {
            Ok(decoded) => decoded,
            // Decoding failed: fall back to the raw markup.
            Err(_) => markup,
        }
    }
}
impl SelectExt for Html {
fn element(&self) -> ElementRef {
self.root_element()
}
}
// An `ElementRef` already is the element the `SelectExt` helpers operate on.
impl<'a> SelectExt for ElementRef<'a> {
// Returns the reference itself by value (dereferencing `&self`).
fn element(&self) -> ElementRef {
*self
}
}

View File

@@ -1,5 +1,6 @@
mod context;
pub mod entry;
mod html;
mod probe;
pub use crate::context::Context;

View File

@@ -1,8 +1,18 @@
use lazy_static::lazy_static;
use scraper::{Html, Selector};
use crate::entry::Entry;
use crate::html::SelectExt;
use crate::probe::Probe;
// CSS selectors used by `from_html`, built through `lazy_static!` instead of
// being re-parsed inside the function. Each `unwrap()` panics if its selector
// string fails to parse.
lazy_static! {
// First match's text is pushed onto `messages`.
static ref MESSAGE: Selector = Selector::parse(".CompanyResultListItem h3.name > a").unwrap();
// First match's text is pushed onto `history`.
static ref HISTORY_1: Selector =
Selector::parse("div.PhoneNoHit div.search-info-container p").unwrap();
// Every match's text is pushed onto `history`.
static ref HISTORY_2: Selector =
Selector::parse("div.feedback-types div.feedback-type-item").unwrap();
}
fn from_html(document: &str) -> Result<Entry, ()> {
let html = Html::parse_document(document);
@@ -10,40 +20,23 @@ fn from_html(document: &str) -> Result<Entry, ()> {
let mut history = Vec::new();
let comments = Vec::new();
let selector = Selector::parse(".CompanyResultListItem h3.name > a").unwrap();
if let Some(element) = html.select(&selector).next() {
let message = element.inner_html();
let message = htmlescape::decode_html(&message).unwrap();
if let Some(message) = html
.select(&MESSAGE)
.next()
.map(|element| element.easy_text())
{
messages.push(message);
}
let selector = Selector::parse("div.PhoneNoHit div.search-info-container p").unwrap();
if let Some(element) = html.select(&selector).next() {
let message = element
.text()
.map(str::trim)
.filter(|s| !s.is_empty())
.collect::<Vec<_>>()
.join(" ");
let message = htmlescape::decode_html(&message).unwrap();
if let Some(message) = html
.select(&HISTORY_1)
.next()
.map(|element| element.easy_text())
{
history.push(message);
}
let selector = Selector::parse("div.feedback-types div.feedback-type-item").unwrap();
for element in html.select(&selector) {
let message = element
.text()
.map(str::trim)
.filter(|s| !s.is_empty())
.collect::<Vec<_>>()
.join(" ");
for message in html.select(&HISTORY_2).map(|element| element.easy_text()) {
history.push(message);
}

View File

@@ -1,7 +1,13 @@
use lazy_static::lazy_static;
use scraper::{Html, Selector};
use crate::html::SelectExt;
use crate::probe::{Entry, Probe};
// CSS selector used by `from_html`, built through `lazy_static!` instead of
// being re-parsed inside the function. The `unwrap()` panics if the selector
// string fails to parse.
lazy_static! {
// First match's text is pushed onto `messages`.
static ref MESSAGE: Selector = Selector::parse(".panel-heading > h1:nth-child(3)").unwrap();
}
fn from_html(document: &str) -> Result<Entry, ()> {
let html = Html::parse_document(document);
@@ -9,12 +15,11 @@ fn from_html(document: &str) -> Result<Entry, ()> {
let history = Vec::new();
let comments = Vec::new();
let selector = Selector::parse(".panel-heading > h1:nth-child(3)").unwrap();
if let Some(element) = html.select(&selector).next() {
let message = element.inner_html();
let message = htmlescape::decode_html(&message).unwrap();
if let Some(message) = html
.select(&MESSAGE)
.next()
.map(|element| element.easy_text())
{
messages.push(message);
}

View File

@@ -1,52 +1,51 @@
use chrono_tz::Europe::Stockholm;
use lazy_static::lazy_static;
use scraper::{Html, Selector};
use crate::entry::{Comment, Date, Entry};
use crate::html::SelectExt;
use crate::probe::Probe;
// CSS selectors used by `from_html`, built through `lazy_static!` instead of
// being re-parsed inside the function (and inside the per-comment loop). Each
// `unwrap()` panics if its selector string fails to parse.
lazy_static! {
// First match's decoded inner HTML is pushed onto `messages`.
static ref MESSAGE: Selector = Selector::parse("#content p:nth-child(2) i").unwrap();
// History paragraph used when no message was found.
static ref HISTORY_1: Selector = Selector::parse("#content p:nth-child(4)").unwrap();
// History paragraph used when a message was found.
static ref HISTORY_2: Selector = Selector::parse("#content p:nth-child(5)").unwrap();
// One match per comment; the COMMENT_* selectors below are applied within each match.
static ref COMMENTS: Selector =
Selector::parse("#kommentarer > [itemtype='http://data-vocabulary.org/Review']").unwrap();
// Element from which the comment's datetime string is read.
static ref COMMENT_DATETIME: Selector = Selector::parse("small").unwrap();
// Comment title; an empty title is turned into `None` by the caller.
static ref COMMENT_TITLE: Selector = Selector::parse("h3").unwrap();
// Comment body; the caller falls back to an empty string when it is missing.
static ref COMMENT_MESSAGE: Selector = Selector::parse("[itemprop='description']").unwrap();
}
fn from_html(document: &str) -> Result<Entry, ()> {
let html = Html::parse_document(document);
let mut messages = Vec::new();
let mut history = Vec::new();
let mut comments = Vec::new();
let selector = Selector::parse("#content p:nth-child(2) i").unwrap();
if let Some(element) = html.select(&selector).next() {
if let Some(element) = html.select(&MESSAGE).next() {
let message = element.inner_html();
let message = htmlescape::decode_html(&message).unwrap();
messages.push(message);
}
let mut history = Vec::new();
let selector = if messages.is_empty() {
Selector::parse("#content p:nth-child(4)").unwrap()
if let Some(message) = html
.select(if messages.is_empty() {
&HISTORY_1
} else {
Selector::parse("#content p:nth-child(5)").unwrap()
};
if let Some(element) = html.select(&selector).next() {
let message = element
.text()
.map(str::trim)
.filter(|s| !s.is_empty())
.collect::<Vec<_>>()
.join(" ");
&HISTORY_2
})
.next()
.map(|element| element.easy_text())
{
history.push(message);
}
let mut comments = Vec::new();
let selector =
Selector::parse("#kommentarer > [itemtype='http://data-vocabulary.org/Review']").unwrap();
for comment in html.select(&selector) {
let selector = Selector::parse("small").unwrap();
for comment in html.select(&COMMENTS) {
let datetime = comment
.select(&selector)
.select(&COMMENT_DATETIME)
.next()
.unwrap()
.value()
@@ -54,20 +53,22 @@ fn from_html(document: &str) -> Result<Entry, ()> {
.unwrap()
.to_string();
let selector = Selector::parse("h3").unwrap();
let title = comment
.select(&COMMENT_TITLE)
.next()
.map(|element| element.easy_inner_html())
.filter(|title| !title.is_empty());
let title = comment.select(&selector).next().unwrap().inner_html();
let title = htmlescape::decode_html(&title).unwrap();
let selector = Selector::parse("[itemprop='description']").unwrap();
let message = comment.select(&selector).next().unwrap().inner_html();
let message = htmlescape::decode_html(&message).unwrap();
let message = comment
.select(&COMMENT_MESSAGE)
.next()
.map(|element| element.easy_inner_html())
.unwrap_or_else(String::new);
comments.push(Comment {
datetime: Date::datetime_from(Stockholm, &datetime, "%Y-%m-%d %H:%M:%S")
.expect("failed to parse datetime"),
title: if title.is_empty() { None } else { Some(title) },
title,
message,
});
}

View File

@@ -1,11 +1,20 @@
use std::str;
use chrono_tz::Europe::Stockholm;
use lazy_static::lazy_static;
use scraper::{Html, Selector};
use crate::entry::{Comment, Date, Entry};
use crate::html::SelectExt;
use crate::probe::Probe;
// CSS selectors used by `from_html`, built through `lazy_static!` instead of
// being re-parsed inside the function (and inside the per-comment loop). Each
// `unwrap()` panics if its selector string fails to parse.
lazy_static! {
// Every match's text is pushed onto `messages`.
static ref MESSAGE: Selector = Selector::parse("#toporganisations li").unwrap();
// One match per comment; the COMMENT_* selectors below are applied within each match.
static ref COMMENTS: Selector = Selector::parse("#calls ol li").unwrap();
// Element holding the comment date; the caller parses it with "%Y-%m-%d" and panics if it is missing.
static ref COMMENT_DATETIME: Selector = Selector::parse("div:nth-child(4)").unwrap();
// Comment body; the caller falls back to an empty string when it is missing.
static ref COMMENT_MESSAGE: Selector = Selector::parse("div:nth-child(3)").unwrap();
}
fn from_html(document: &str) -> Result<Entry, ()> {
let html = Html::parse_document(document);
@@ -13,43 +22,22 @@ fn from_html(document: &str) -> Result<Entry, ()> {
let history = Vec::new();
let mut comments = Vec::new();
let selector = Selector::parse("#toporganisations li").unwrap();
for element in html.select(&selector) {
let message = element
.text()
.map(str::trim)
.filter(|s| !s.is_empty())
.collect::<Vec<_>>()
.join(" ");
for message in html.select(&MESSAGE).map(|element| element.easy_text()) {
messages.push(message);
}
let selector = Selector::parse("#calls ol li").expect("failed to build selector");
for element in html.select(&selector) {
let selector = Selector::parse("div:nth-child(4)").expect("failed to build selector");
for element in html.select(&COMMENTS) {
let date = element
.select(&selector)
.select(&COMMENT_DATETIME)
.next()
.expect("failed to find datetime")
.inner_html();
let selector = Selector::parse("div:nth-child(3)").expect("failed to build selector");
.map(|element| element.easy_inner_html())
.expect("failed to find datetime");
let message = element
.select(&selector)
.select(&COMMENT_MESSAGE)
.next()
.unwrap()
.text()
.map(str::trim)
.filter(|s| !s.is_empty())
.collect::<Vec<_>>()
.join(" ");
let message = htmlescape::decode_html(&message).unwrap();
.map(|element| element.easy_text())
.unwrap_or_else(String::new);
comments.push(Comment {
datetime: Date::date_from(Stockholm, &date, "%Y-%m-%d").expect("failed to parse date"),