feat: add hitta.se flight-data parser as pure native-testable functions
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -8,5 +8,9 @@ authors.workspace = true
|
||||
crate-type = ["cdylib", "rlib"]
|
||||
|
||||
[dependencies]
|
||||
regex = "1"
|
||||
serde = { version = "1", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
|
||||
[dev-dependencies]
|
||||
insta = { version = "1.47", features = ["yaml"] }
|
||||
|
||||
@@ -1 +1,2 @@
|
||||
// modules added as they are implemented
|
||||
pub mod parser;
|
||||
|
||||
@@ -0,0 +1,212 @@
|
||||
use std::sync::LazyLock;
|
||||
|
||||
use regex::Regex;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
static FLIGHT_RE: LazyLock<Regex> = LazyLock::new(|| {
|
||||
Regex::new(r#"(?s)self\.__next_f\.push\(\[1,"(.*?)"\]\)\s*</script>"#)
|
||||
.expect("FLIGHT_RE is valid")
|
||||
});
|
||||
|
||||
#[derive(Debug, PartialEq, Serialize)]
|
||||
pub struct ParsedEntry {
|
||||
pub messages: Vec<String>,
|
||||
pub history: Vec<String>,
|
||||
pub comments: Vec<ParsedComment>,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Serialize)]
|
||||
pub struct ParsedComment {
|
||||
/// Unix epoch seconds, UTC.
|
||||
pub timestamp: Option<i64>,
|
||||
pub title: Option<String>,
|
||||
pub message: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Serialize)]
|
||||
pub enum ParseError {
|
||||
/// Page fetched and understood, but contains no data for the number.
|
||||
NoData,
|
||||
/// Page structure did not match expectations — scraper rot signal.
|
||||
Failed(String),
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
struct Statistics {
|
||||
#[serde(default)]
|
||||
comments: Vec<RawComment>,
|
||||
statistics_text: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
struct RawComment {
|
||||
comment: String,
|
||||
timestamp: u64,
|
||||
}
|
||||
|
||||
/// Lists the URLs to fetch for a lookup of `number`.
///
/// Always returns exactly one URL. `number` is inserted verbatim, without
/// validation or percent-encoding.
pub fn request_urls(number: &str) -> Vec<String> {
    let lookup = format!("https://www.hitta.se/vem-ringde/{number}");
    Vec::from([lookup])
}
|
||||
|
||||
pub fn parse(body: &str) -> Result<ParsedEntry, ParseError> {
|
||||
let captures: Vec<&str> = FLIGHT_RE
|
||||
.captures_iter(body)
|
||||
.filter_map(|cap| cap.get(1).map(|m| m.as_str()))
|
||||
.collect();
|
||||
|
||||
if captures.is_empty() {
|
||||
return Err(ParseError::Failed(
|
||||
"__next_f flight data not found".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
for raw_payload in &captures {
|
||||
// Unescape the JSON string captured from the HTML attribute.
|
||||
// We wrap it as a JSON string value so serde_json handles all escape
|
||||
// sequences correctly. Literal newlines (which appear in synthetic test
|
||||
// payloads) are escaped first so the JSON remains valid.
|
||||
let sanitized = raw_payload.replace('\n', "\\n").replace('\r', "\\r");
|
||||
let json_str = format!(r#""{sanitized}""#);
|
||||
|
||||
let payload: String = match serde_json::from_str(&json_str) {
|
||||
Ok(s) => s,
|
||||
Err(_) => continue,
|
||||
};
|
||||
|
||||
let marker = "\"statistics\":";
|
||||
|
||||
let idx = match payload.find(marker) {
|
||||
Some(idx) => idx,
|
||||
None => continue,
|
||||
};
|
||||
|
||||
// Found the statistics marker — deserialize or report rot.
|
||||
let after_marker = &payload[idx + marker.len()..];
|
||||
|
||||
let mut de = serde_json::Deserializer::from_str(after_marker);
|
||||
|
||||
let stats = match Statistics::deserialize(&mut de) {
|
||||
Ok(s) => s,
|
||||
Err(err) => return Err(ParseError::Failed(err.to_string())),
|
||||
};
|
||||
|
||||
let mut comments: Vec<ParsedComment> = stats
|
||||
.comments
|
||||
.into_iter()
|
||||
.filter(|raw| !raw.comment.trim().is_empty())
|
||||
.map(|raw| ParsedComment {
|
||||
timestamp: Some((raw.timestamp / 1000) as i64),
|
||||
title: None,
|
||||
message: raw.comment,
|
||||
})
|
||||
.collect();
|
||||
|
||||
comments.sort_by(|a, b| b.timestamp.cmp(&a.timestamp));
|
||||
|
||||
return Ok(ParsedEntry {
|
||||
messages: Vec::new(),
|
||||
history: vec![stats.statistics_text],
|
||||
comments,
|
||||
});
|
||||
}
|
||||
|
||||
Err(ParseError::NoData)
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Wraps `payload_json` in a minimal page using the App Router flight-data format.
    ///
    /// Backslashes and double quotes are escaped so the payload survives being
    /// embedded in the script's string literal.
    fn flight_page(payload_json: &str) -> String {
        let chunk = payload_json.replace('\\', "\\\\").replace('"', "\\\"");
        format!(r#"<html><body><script>self.__next_f.push([1,"{chunk}"])</script></body></html>"#)
    }

    #[test]
    fn requests_single_hitta_url() {
        let urls = request_urls("0700000000");
        assert_eq!(urls, ["https://www.hitta.se/vem-ringde/0700000000"]);
    }

    #[test]
    fn parses_reported_number_fixture() {
        let page = include_str!("../../../../fixtures/hitta/fresh-0104754350.html");
        let entry = parse(page).unwrap();

        assert!(entry.messages.is_empty());
        assert_eq!(entry.history, ["Elva andra har rapporterat detta nummer"]);
        // Every comment on this number has empty text, so all are filtered out.
        assert!(entry.comments.is_empty());
    }

    #[test]
    fn parses_low_activity_number_fixture() {
        let page = include_str!("../../../../fixtures/hitta/fresh-0313908905.html");
        let entry = parse(page).unwrap();

        assert_eq!(entry.history, ["1000 andra har också sökt på detta nummer"]);
        assert!(entry.comments.is_empty());
    }

    #[test]
    fn extracts_and_converts_comments() {
        let page = flight_page(
            r#"{"foo":{"statistics":{"searches":5,"comments":[
                {"id":"a","comment":"Telefonförsäljare","time":"03 okt","timestamp":1538574919000,"upVotes":1},
                {"id":"b","comment":"","time":"04 okt","timestamp":1538661319000},
                {"id":"c","comment":"Bluff","time":"05 okt","timestamp":1538747719000}
            ],"statisticsText":"Tre rapporter"}}}"#,
        );

        let entry = parse(&page).unwrap();
        assert_eq!(entry.history, ["Tre rapporter"]);

        // Empty-text comment filtered; newest first; millis -> seconds.
        let seen: Vec<(Option<i64>, &str)> = entry
            .comments
            .iter()
            .map(|c| (c.timestamp, c.message.as_str()))
            .collect();
        assert_eq!(
            seen,
            [
                (Some(1538747719), "Bluff"),
                (Some(1538574919), "Telefonförsäljare"),
            ]
        );
        assert_eq!(entry.comments[0].title, None);
    }

    #[test]
    fn flight_data_without_statistics_is_no_data() {
        let page = flight_page(r#"{"someOtherComponent":{"props":{}}}"#);
        let outcome = parse(&page);
        assert_eq!(outcome, Err(ParseError::NoData));
    }

    #[test]
    fn legacy_next_data_page_is_failed() {
        // 2019 Pages Router fixture: no __next_f flight data at all.
        let page = include_str!("../../../../fixtures/hitta/0104754350.html");
        let outcome = parse(page);
        assert!(matches!(outcome, Err(ParseError::Failed(_))));
    }

    #[test]
    fn garbage_is_failed() {
        let outcome = parse("<html></html>");
        assert!(matches!(outcome, Err(ParseError::Failed(_))));
    }

    #[test]
    fn snapshot_reported_number() {
        let page = include_str!("../../../../fixtures/hitta/fresh-0104754350.html");
        insta::assert_yaml_snapshot!(parse(page), @r###"
        Ok:
          messages: []
          history:
            - Elva andra har rapporterat detta nummer
          comments: []
        "###);
    }
}
|
||||
Reference in New Issue
Block a user