Added test, but broke stuff.

This commit is contained in:
2019-01-21 15:41:53 +01:00
parent 1f63bcf85f
commit 94fa03d45c
13 changed files with 1336 additions and 238 deletions

View File

@@ -1,3 +1,5 @@
use std::fmt;
mod eniro;
mod hitta;
mod konsument_info;
@@ -12,6 +14,48 @@ pub use self::vem_ringde::VemRingde;
use crate::context::Context;
/// Result of parsing one probe's page: three independent lists of text.
///
/// The `Display` implementation renders every list item on its own line,
/// in the order `messages`, `history`, `comments`.
#[derive(Debug, PartialEq)]
pub struct Entry {
    /// Plain text lines, rendered as ` <line>`.
    pub messages: Vec<String>,
    /// History lines, rendered exactly like `messages` and printed after them.
    pub history: Vec<String>,
    /// Structured comments, rendered as ` * <datetime>: <title> - <message>`.
    pub comments: Vec<Comment>,
}

impl fmt::Display for Entry {
    /// Writes one newline-terminated line per item; an entry whose lists are
    /// all empty produces no output at all.
    fn fmt(&self, out: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Messages and history share the same layout, so they are emitted
        // through a single chained iterator (messages first).
        self.messages
            .iter()
            .chain(self.history.iter())
            .try_for_each(|line| writeln!(out, " {}", line))?;

        self.comments.iter().try_for_each(|entry| {
            writeln!(
                out,
                " * {}: {} - {}",
                entry.datetime, entry.title, entry.message
            )
        })
    }
}

/// A single comment attached to an [`Entry`]; all parts are kept as text.
#[derive(Debug, PartialEq)]
pub struct Comment {
    /// Timestamp text, stored as given by the probe (no parsing is done here).
    pub datetime: String,
    /// Heading of the comment.
    pub title: String,
    /// Body of the comment.
    pub message: String,
}
pub trait Probe {
fn uri(&self, _: &str) -> String;
fn search(&mut self, _: &mut Context, _: &str) -> Result<(), ()>;

View File

@@ -1,21 +1,29 @@
use unhtml::FromHtml;
use unhtml_derive::FromHtml;
use scraper::{Html, Selector};
use crate::context::Context;
use crate::probe::Probe;
use crate::probe::{Entry, Probe};
#[derive(Debug, FromHtml)]
#[html(selector = ".CompanyResultListItem")]
struct Company {
#[html(selector = "h3.name > a", attr = "inner")]
name: String,
}
fn from_html(document: &str) -> Result<Entry, ()> {
let html = Html::parse_document(document);
#[derive(Debug, FromHtml)]
#[html(selector = ".PhoneNoHit")]
struct Error {
#[html(selector = ".search-info-container > p", attr = "inner")]
message: String,
let mut messages = Vec::new();
let history = Vec::new();
let comments = Vec::new();
let selector = Selector::parse(".CompanyResultListItem h3.name > a").unwrap();
if let Some(element) = html.select(&selector).next() {
let message = element.inner_html();
let message = htmlescape::decode_html(&message).unwrap();
messages.push(message);
}
Ok(Entry {
messages,
history,
comments,
})
}
pub struct Eniro;
@@ -37,18 +45,32 @@ impl Probe for Eniro {
body
};
if let Ok(company) = Company::from_html(&body) {
println!("eniro.se:");
println!(" {}", company.name);
match from_html(&body) {
Ok(entry) => {
println!("eniro.se:");
print!("{}", entry);
Ok(())
} else if let Ok(error) = Error::from_html(&body) {
println!("eniro.se:");
println!(" Antal sökningar på det här numret: {}", error.message);
Ok(())
} else {
Err(())
Ok(())
}
Err(_) => Err(()),
}
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses the recorded eniro page for 0702269893 and expects a single
    /// message with no history and no comments.
    #[test]
    fn test_0702269893() {
        let page = include_str!("../../fixtures/eniro/0702269893.html");

        let parsed = from_html(page);

        let wanted = Entry {
            messages: vec![String::from("Anonym Kund För Refill")],
            history: Vec::new(),
            comments: Vec::new(),
        };
        assert_eq!(parsed, Ok(wanted));
    }
}

View File

@@ -3,7 +3,7 @@ use regex::Regex;
use serde::Deserialize;
use crate::context::Context;
use crate::probe::Probe;
use crate::probe::{self, Entry, Probe};
#[derive(Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
@@ -41,6 +41,46 @@ struct Comment {
timestamp: u64,
}
fn from_html(document: &str) -> Result<Entry, ()> {
let re = Regex::new(r#"<script>__NEXT_DATA__ = (.*?);__NEXT_LOADED_PAGES__"#).unwrap();
let result = re.captures(&document).ok_or_else(|| {
debug!("Hitta: failed to find __NEXT_DATA__");
})?;
let json = result.get(1).unwrap().as_str();
if let Ok(data) = serde_json::from_str::<Data>(&json) {
let messages = Vec::new();
let mut history = Vec::new();
let mut comments = Vec::new();
if let Some(phone_data) = data.props.page_props.phone_data {
history.push(phone_data.statistics_text);
for comment in phone_data.comments {
comments.push(probe::Comment {
datetime: "".to_string(),
title: "".to_string(),
message: comment.comment,
});
}
}
Ok(Entry {
messages,
history,
comments,
})
} else {
if let Err(error) = serde_json::from_str::<Data>(&json) {
debug!("Hitta: failed to deserialize data: {:#?}", error);
}
Err(())
}
}
pub struct Hitta;
impl Probe for Hitta {
@@ -60,36 +100,32 @@ impl Probe for Hitta {
body
};
let re = Regex::new(r#"<script>__NEXT_DATA__ = (.*?);__NEXT_LOADED_PAGES__"#).unwrap();
if let Some(result) = re.captures(&body) {
let json = result.get(1).unwrap().as_str();
if let Ok(data) = serde_json::from_str::<Data>(&json) {
match from_html(&body) {
Ok(entry) => {
println!("hitta.se:");
if let Some(phone_data) = data.props.page_props.phone_data {
println!(" {}", phone_data.statistics_text);
for comment in &phone_data.comments {
println!(" * {}", comment.comment);
}
} else {
println!(" Vi hittar det mesta, men inte just den här sidan.");
}
print!("{}", entry);
Ok(())
} else {
if let Err(error) = serde_json::from_str::<Data>(&json) {
debug!("Hitta: failed to deserialize data: {:#?}", error);
}
Err(())
}
} else {
debug!("Hitta: failed to find __NEXT_DATA__");
Err(())
Err(_) => Err(()),
}
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses the recorded hitta page for 0702269893 and expects one history
    /// line, with no messages and no comments.
    #[test]
    fn test_0702269893() {
        let page = include_str!("../../fixtures/hitta/0702269893.html");

        let parsed = from_html(page);

        let wanted = Entry {
            messages: Vec::new(),
            history: vec![String::from("Tre andra har också sökt på detta nummer")],
            comments: Vec::new(),
        };
        assert_eq!(parsed, Ok(wanted));
    }
}

View File

@@ -1,21 +1,29 @@
use unhtml::FromHtml;
use unhtml_derive::FromHtml;
use scraper::{Html, Selector};
use crate::context::Context;
use crate::probe::Probe;
use crate::probe::{Entry, Probe};
#[derive(Debug, FromHtml)]
#[html(selector = ".panel-body")]
struct Info {
#[html(selector = "h4", attr = "inner")]
message: String,
}
fn from_html(document: &str) -> Result<Entry, ()> {
let html = Html::parse_document(document);
#[derive(Debug, FromHtml)]
#[html(selector = ".body-content > .row")]
struct Error {
#[html(selector = ".col-md-12", attr = "inner")]
message: String,
let mut messages = Vec::new();
let history = Vec::new();
let comments = Vec::new();
let selector = Selector::parse(".panel-heading > h1:nth-child(3)").unwrap();
if let Some(element) = html.select(&selector).next() {
let message = element.inner_html();
let message = htmlescape::decode_html(&message).unwrap();
messages.push(message);
}
Ok(Entry {
messages,
history,
comments,
})
}
pub struct KonsumentInfo;
@@ -37,16 +45,32 @@ impl Probe for KonsumentInfo {
body
};
println!("konsumentinfo.se:");
match from_html(&body) {
Ok(entry) => {
println!("konsumentinfo.se:");
print!("{}", entry);
if let Ok(info) = Info::from_html(&body) {
println!(" {}", info.message);
} else if let Ok(error) = Error::from_html(&body) {
println!(" {}", error.message);
} else {
println!(" Failed to find any data");
Ok(())
}
Err(_) => Err(()),
}
Ok(())
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses the recorded konsumentinfo page for 0702269893 and expects a
    /// single message, with no history and no comments.
    #[test]
    fn test_0702269893() {
        let page = include_str!("../../fixtures/konsumentinfo/0702269893.html");

        let parsed = from_html(page);

        let wanted = Entry {
            messages: vec![String::from("Hydroscand AB")],
            history: Vec::new(),
            comments: Vec::new(),
        };
        assert_eq!(parsed, Ok(wanted));
    }
}

View File

@@ -1,39 +1,70 @@
use log::debug;
use unhtml::{self, FromHtml, VecFromHtml};
use unhtml_derive::FromHtml;
use scraper::{Html, Selector};
use crate::context::Context;
use crate::probe::Probe;
use crate::probe::{Comment, Entry, Probe};
#[derive(Debug, FromHtml)]
#[html(selector = "article")]
struct Page {
content: Content,
fn from_html(document: &str) -> Result<Entry, ()> {
let html = Html::parse_document(document);
#[html(selector = "#kommentarer > [itemtype='http://data-vocabulary.org/Review']")]
comments: Vec<Comment>,
}
let mut messages = Vec::new();
#[derive(Debug, FromHtml)]
#[html(selector = "#content")]
struct Content {
#[html(selector = "p:nth-child(2)", attr = "inner", default = "")]
title: String,
let selector = Selector::parse("#content p:nth-child(2) i").unwrap();
#[html(selector = "p:nth-child(4)", attr = "inner")]
history: String,
}
if let Some(element) = html.select(&selector).next() {
let message = element.inner_html();
let message = htmlescape::decode_html(&message).unwrap();
#[derive(Debug, FromHtml)]
struct Comment {
#[html(selector = "small", attr = "datetime")]
datetime: String,
messages.push(message);
}
#[html(selector = "h3", attr = "inner")]
title: String,
let mut history = Vec::new();
#[html(selector = "[itemprop='description']", attr = "inner")]
comment: String,
let selector = Selector::parse("#content p:nth-child(5)").unwrap();
if let Some(element) = html.select(&selector).next() {
history.push(element.inner_html());
}
let mut comments = Vec::new();
let selector =
Selector::parse("#kommentarer > [itemtype='http://data-vocabulary.org/Review']").unwrap();
for comment in html.select(&selector) {
let selector = Selector::parse("small").unwrap();
let datetime = comment
.select(&selector)
.next()
.unwrap()
.value()
.attr("datetime")
.unwrap()
.to_string();
let selector = Selector::parse("h3").unwrap();
let title = comment.select(&selector).next().unwrap().inner_html();
let title = htmlescape::decode_html(&title).unwrap();
let selector = Selector::parse("[itemprop='description']").unwrap();
let message = comment.select(&selector).next().unwrap().inner_html();
let message = htmlescape::decode_html(&message).unwrap();
comments.push(Comment {
datetime,
title,
message,
});
}
Ok(Entry {
messages,
history,
comments,
})
}
pub struct Telefonforsaljare;
@@ -57,27 +88,41 @@ impl Probe for Telefonforsaljare {
println!("telefonforsaljare.nu:");
if let Ok(page) = Page::from_html(&body) {
if !page.content.title.is_empty() {
println!(" {}", page.content.title);
match from_html(&body) {
Ok(entry) => {
print!("{}", entry);
}
Err(_) => {
debug!("telefonforsaljare: failed to parse page");
println!(" {}", page.content.history);
for comment in &page.comments {
println!(
" * {}: {} - {}",
comment.datetime, comment.title, comment.comment
);
println!(" Failed to find any data");
}
} else {
if let Err(error) = Page::from_html(&body) {
debug!("telefonforsaljare: failed to parse page: {:#?}", error);
}
println!(" Failed to find any data");
}
Ok(())
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses the recorded telefonforsaljare page for 0702269893 and expects
    /// one message, one history line (inner HTML kept verbatim, including
    /// the `<strong>` tags) and one comment.
    #[test]
    fn test_0702269893() {
        let page = include_str!("../../fixtures/telefonforsaljare/0702269893.html");

        let parsed = from_html(page);

        let only_comment = Comment {
            datetime: String::from("2019-01-18 14:30:55"),
            title: String::from("Alnö Design & Produktion AB"),
            message: String::from("Renhållning, service, kemprodukter"),
        };
        let wanted = Entry {
            messages: vec![String::from("Alnö Design & Produktion AB")],
            history: vec![String::from("De senaste 24 timmarna har <strong>3 personer</strong> sökt efter numret 0702269893. Det kan tyda på att numret används av telefonförsäljare. Totalt har minst <strong>4 personer</strong> sökt efter numret.")],
            comments: vec![only_comment],
        };
        assert_eq!(parsed, Ok(wanted));
    }
}

View File

@@ -1,36 +1,21 @@
use log::debug;
use unhtml::{self, FromHtml, VecFromHtml};
use unhtml_derive::FromHtml;
use scraper::{Html, Selector};
use crate::context::Context;
use crate::probe::Probe;
use crate::probe::{Comment, Entry, Probe};
#[derive(Debug, FromHtml)]
#[html(selector = "#content")]
struct Page {
#[html(selector = "#toporganisations > li")]
owners: Vec<Owner>,
fn from_html(document: &str) -> Result<Entry, ()> {
let _html = Html::parse_document(document);
#[html(selector = "#calls > ol.table > li")]
calls: Vec<Call>,
}
let messages = Vec::new();
let history = Vec::new();
let comments = Vec::new();
#[derive(Debug, FromHtml)]
struct Owner {
#[html(selector = "a", attr = "inner")]
title: String,
#[html(selector = "span", attr = "inner")]
calls: String,
}
#[derive(Debug, FromHtml)]
struct Call {
#[html(selector = ".w40", attr = "inner")]
who: String,
#[html(selector = ".w15", attr = "inner")]
date: String,
Ok(Entry {
messages,
history,
comments,
})
}
pub struct VemRingde;
@@ -52,8 +37,17 @@ impl Probe for VemRingde {
body
};
println!("vemringde.se:");
match from_html(&body) {
Ok(entry) => {
println!("vemringde.se:");
print!("{}", entry);
Ok(())
}
Err(_) => Err(()),
}
/*
if let Ok(page) = Page::from_html(&body) {
if !page.owners.is_empty() {
println!(" ägare:");
@@ -85,7 +79,24 @@ impl Probe for VemRingde {
println!(" Failed to find any data");
}
Err(())
*/
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses the recorded vemringde page for 0702269893 and expects an
    /// entirely empty entry.
    #[test]
    fn test_0702269893() {
        let page = include_str!("../../fixtures/vemringde/0702269893.html");

        let parsed = from_html(page);

        let wanted = Entry {
            messages: Vec::new(),
            history: Vec::new(),
            comments: Vec::new(),
        };
        assert_eq!(parsed, Ok(wanted));
    }
}