4980beec0a
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
213 lines
6.6 KiB
Rust
213 lines
6.6 KiB
Rust
use std::sync::LazyLock;
|
|
|
|
use regex::Regex;
|
|
use serde::{Deserialize, Serialize};
|
|
|
|
static FLIGHT_RE: LazyLock<Regex> = LazyLock::new(|| {
|
|
Regex::new(r#"(?s)self\.__next_f\.push\(\[1,"(.*?)"\]\)\s*</script>"#)
|
|
.expect("FLIGHT_RE is valid")
|
|
});
|
|
|
|
#[derive(Debug, PartialEq, Serialize)]
|
|
pub struct ParsedEntry {
|
|
pub messages: Vec<String>,
|
|
pub history: Vec<String>,
|
|
pub comments: Vec<ParsedComment>,
|
|
}
|
|
|
|
#[derive(Debug, PartialEq, Serialize)]
|
|
pub struct ParsedComment {
|
|
/// Unix epoch seconds, UTC.
|
|
pub timestamp: Option<i64>,
|
|
pub title: Option<String>,
|
|
pub message: String,
|
|
}
|
|
|
|
#[derive(Debug, PartialEq, Serialize)]
|
|
pub enum ParseError {
|
|
/// Page fetched and understood, but contains no data for the number.
|
|
NoData,
|
|
/// Page structure did not match expectations — scraper rot signal.
|
|
Failed(String),
|
|
}
|
|
|
|
#[derive(Debug, Deserialize)]
|
|
#[serde(rename_all = "camelCase")]
|
|
struct Statistics {
|
|
#[serde(default)]
|
|
comments: Vec<RawComment>,
|
|
statistics_text: String,
|
|
}
|
|
|
|
#[derive(Debug, Deserialize)]
|
|
struct RawComment {
|
|
comment: String,
|
|
timestamp: u64,
|
|
}
|
|
|
|
/// Lists the page URLs to fetch for `number`.
///
/// Always returns exactly one URL: the hitta.se "vem ringde" lookup page with
/// `number` appended verbatim (no validation or percent-encoding is applied).
pub fn request_urls(number: &str) -> Vec<String> {
    let lookup_url = format!("https://www.hitta.se/vem-ringde/{number}");
    Vec::from([lookup_url])
}
|
|
|
|
pub fn parse(body: &str) -> Result<ParsedEntry, ParseError> {
|
|
let captures: Vec<&str> = FLIGHT_RE
|
|
.captures_iter(body)
|
|
.filter_map(|cap| cap.get(1).map(|m| m.as_str()))
|
|
.collect();
|
|
|
|
if captures.is_empty() {
|
|
return Err(ParseError::Failed(
|
|
"__next_f flight data not found".to_string(),
|
|
));
|
|
}
|
|
|
|
for raw_payload in &captures {
|
|
// Unescape the JSON string captured from the HTML attribute.
|
|
// We wrap it as a JSON string value so serde_json handles all escape
|
|
// sequences correctly. Literal newlines (which appear in synthetic test
|
|
// payloads) are escaped first so the JSON remains valid.
|
|
let sanitized = raw_payload.replace('\n', "\\n").replace('\r', "\\r");
|
|
let json_str = format!(r#""{sanitized}""#);
|
|
|
|
let payload: String = match serde_json::from_str(&json_str) {
|
|
Ok(s) => s,
|
|
Err(_) => continue,
|
|
};
|
|
|
|
let marker = "\"statistics\":";
|
|
|
|
let idx = match payload.find(marker) {
|
|
Some(idx) => idx,
|
|
None => continue,
|
|
};
|
|
|
|
// Found the statistics marker — deserialize or report rot.
|
|
let after_marker = &payload[idx + marker.len()..];
|
|
|
|
let mut de = serde_json::Deserializer::from_str(after_marker);
|
|
|
|
let stats = match Statistics::deserialize(&mut de) {
|
|
Ok(s) => s,
|
|
Err(err) => return Err(ParseError::Failed(err.to_string())),
|
|
};
|
|
|
|
let mut comments: Vec<ParsedComment> = stats
|
|
.comments
|
|
.into_iter()
|
|
.filter(|raw| !raw.comment.trim().is_empty())
|
|
.map(|raw| ParsedComment {
|
|
timestamp: Some((raw.timestamp / 1000) as i64),
|
|
title: None,
|
|
message: raw.comment,
|
|
})
|
|
.collect();
|
|
|
|
comments.sort_by(|a, b| b.timestamp.cmp(&a.timestamp));
|
|
|
|
return Ok(ParsedEntry {
|
|
messages: Vec::new(),
|
|
history: vec![stats.statistics_text],
|
|
comments,
|
|
});
|
|
}
|
|
|
|
Err(ParseError::NoData)
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Wraps `payload_json` in a minimal App Router page.
    ///
    /// Backslashes and double quotes are escaped, then the result is embedded
    /// as the string argument of a single `self.__next_f.push([1,"…"])` script.
    fn flight_page(payload_json: &str) -> String {
        let embedded = payload_json.replace('\\', "\\\\").replace('"', "\\\"");
        format!(
            r#"<html><body><script>self.__next_f.push([1,"{}"])</script></body></html>"#,
            embedded
        )
    }

    #[test]
    fn requests_single_hitta_url() {
        let urls = request_urls("0700000000");
        assert_eq!(
            urls,
            ["https://www.hitta.se/vem-ringde/0700000000".to_string()]
        );
    }

    #[test]
    fn parses_reported_number_fixture() {
        let html = include_str!("../../../../fixtures/hitta/fresh-0104754350.html");
        let ParsedEntry {
            messages,
            history,
            comments,
        } = parse(html).unwrap();

        assert!(messages.is_empty());
        assert_eq!(history, vec!["Elva andra har rapporterat detta nummer"]);
        // Every comment on this number has empty text, so all are filtered out.
        assert!(comments.is_empty());
    }

    #[test]
    fn parses_low_activity_number_fixture() {
        let html = include_str!("../../../../fixtures/hitta/fresh-0313908905.html");
        let parsed = parse(html).unwrap();

        assert_eq!(
            parsed.history,
            vec!["1000 andra har också sökt på detta nummer"]
        );
        assert!(parsed.comments.is_empty());
    }

    #[test]
    fn extracts_and_converts_comments() {
        let html = flight_page(
            r#"{"foo":{"statistics":{"searches":5,"comments":[
                {"id":"a","comment":"Telefonförsäljare","time":"03 okt","timestamp":1538574919000,"upVotes":1},
                {"id":"b","comment":"","time":"04 okt","timestamp":1538661319000},
                {"id":"c","comment":"Bluff","time":"05 okt","timestamp":1538747719000}
            ],"statisticsText":"Tre rapporter"}}}"#,
        );

        let parsed = parse(&html).unwrap();
        assert_eq!(parsed.history, vec!["Tre rapporter"]);

        // The empty-text comment is filtered out, the newest comes first, and
        // milliseconds are converted to seconds.
        let summary: Vec<(Option<i64>, &str)> = parsed
            .comments
            .iter()
            .map(|c| (c.timestamp, c.message.as_str()))
            .collect();
        assert_eq!(
            summary,
            [
                (Some(1538747719), "Bluff"),
                (Some(1538574919), "Telefonförsäljare"),
            ]
        );
        assert_eq!(parsed.comments[0].title, None);
    }

    #[test]
    fn flight_data_without_statistics_is_no_data() {
        let html = flight_page(r#"{"someOtherComponent":{"props":{}}}"#);
        let outcome = parse(&html);
        assert_eq!(outcome, Err(ParseError::NoData));
    }

    #[test]
    fn legacy_next_data_page_is_failed() {
        // 2019 Pages Router fixture: it has no __next_f flight data at all.
        let html = include_str!("../../../../fixtures/hitta/0104754350.html");
        let outcome = parse(html);
        assert!(matches!(outcome, Err(ParseError::Failed(_))));
    }

    #[test]
    fn garbage_is_failed() {
        let outcome = parse("<html></html>");
        assert!(matches!(outcome, Err(ParseError::Failed(_))));
    }

    #[test]
    fn snapshot_reported_number() {
        let html = include_str!("../../../../fixtures/hitta/fresh-0104754350.html");
        insta::assert_yaml_snapshot!(parse(html), @r###"
        Ok:
          messages: []
          history:
            - Elva andra har rapporterat detta nummer
          comments: []
        "###);
    }
}
|