feat: add hitta.se flight-data parser as pure native-testable functions

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-06-05 14:54:04 +02:00
parent 896333254a
commit 4980beec0a
4 changed files with 736 additions and 0 deletions
+212
View File
@@ -0,0 +1,212 @@
use std::sync::LazyLock;
use regex::Regex;
use serde::{Deserialize, Serialize};
/// Matches one inline `self.__next_f.push([1,"…"])` script and captures the
/// still-escaped string argument as group 1.
///
/// `(?s)` lets `.` span newlines, the capture is lazy, and a match must end
/// with `"])` followed by optional whitespace and the closing `</script>` tag.
/// Compiled once on first use.
static FLIGHT_RE: LazyLock<Regex> = LazyLock::new(|| {
    let pattern = r#"(?s)self\.__next_f\.push\(\[1,"(.*?)"\]\)\s*</script>"#;
    Regex::new(pattern).expect("FLIGHT_RE is valid")
});
/// Result of successfully parsing one number-lookup page.
///
/// Serializable so it can be emitted or snapshot-tested as structured data.
#[derive(Debug, PartialEq, Serialize)]
pub struct ParsedEntry {
/// Free-form messages; `parse` in this file always leaves this empty.
pub messages: Vec<String>,
/// Summary lines; `parse` stores the page's `statisticsText` as the only element.
pub history: Vec<String>,
/// User comments with non-blank text, ordered newest first by `parse`.
pub comments: Vec<ParsedComment>,
}
/// A single user comment extracted from the page.
#[derive(Debug, PartialEq, Serialize)]
pub struct ParsedComment {
/// Unix epoch seconds, UTC.
pub timestamp: Option<i64>,
/// Optional heading for the comment; `parse` in this file always sets `None`.
pub title: Option<String>,
/// The comment text as it appeared in the page data.
pub message: String,
}
/// Reasons `parse` can fail to produce a `ParsedEntry`.
///
/// The two variants separate "nothing known about this number" from
/// "the page no longer looks the way the parser expects".
#[derive(Debug, PartialEq, Serialize)]
pub enum ParseError {
/// Page fetched and understood, but contains no data for the number.
NoData,
/// Page structure did not match expectations — scraper rot signal.
/// The payload is a human-readable description of what went wrong.
Failed(String),
}
/// Wire shape of the `statistics` JSON object found in the flight payload.
///
/// Field names are mapped from camelCase (`statisticsText`); keys not listed
/// here are ignored by serde's default behaviour.
#[derive(Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
struct Statistics {
/// User comments; a missing `comments` key deserializes to an empty list.
#[serde(default)]
comments: Vec<RawComment>,
/// Summary sentence (`statisticsText`); required, so its absence is a deserialization error.
statistics_text: String,
}
/// Wire shape of one element of the `comments` array inside `statistics`.
#[derive(Debug, Deserialize)]
struct RawComment {
/// Comment text; may be empty or whitespace-only (`parse` drops those).
comment: String,
/// Creation time. NOTE(review): presumably Unix epoch milliseconds — `parse`
/// divides by 1000 to obtain seconds; confirm against the live payload.
timestamp: u64,
}
/// Returns the URLs to fetch for a lookup of `number`.
///
/// Exactly one URL is produced: the hitta.se "vem-ringde" page with `number`
/// appended verbatim (no validation or escaping is applied).
pub fn request_urls(number: &str) -> Vec<String> {
    let lookup_url = ["https://www.hitta.se/vem-ringde/", number].concat();
    vec![lookup_url]
}
pub fn parse(body: &str) -> Result<ParsedEntry, ParseError> {
let captures: Vec<&str> = FLIGHT_RE
.captures_iter(body)
.filter_map(|cap| cap.get(1).map(|m| m.as_str()))
.collect();
if captures.is_empty() {
return Err(ParseError::Failed(
"__next_f flight data not found".to_string(),
));
}
for raw_payload in &captures {
// Unescape the JSON string captured from the HTML attribute.
// We wrap it as a JSON string value so serde_json handles all escape
// sequences correctly. Literal newlines (which appear in synthetic test
// payloads) are escaped first so the JSON remains valid.
let sanitized = raw_payload.replace('\n', "\\n").replace('\r', "\\r");
let json_str = format!(r#""{sanitized}""#);
let payload: String = match serde_json::from_str(&json_str) {
Ok(s) => s,
Err(_) => continue,
};
let marker = "\"statistics\":";
let idx = match payload.find(marker) {
Some(idx) => idx,
None => continue,
};
// Found the statistics marker — deserialize or report rot.
let after_marker = &payload[idx + marker.len()..];
let mut de = serde_json::Deserializer::from_str(after_marker);
let stats = match Statistics::deserialize(&mut de) {
Ok(s) => s,
Err(err) => return Err(ParseError::Failed(err.to_string())),
};
let mut comments: Vec<ParsedComment> = stats
.comments
.into_iter()
.filter(|raw| !raw.comment.trim().is_empty())
.map(|raw| ParsedComment {
timestamp: Some((raw.timestamp / 1000) as i64),
title: None,
message: raw.comment,
})
.collect();
comments.sort_by(|a, b| b.timestamp.cmp(&a.timestamp));
return Ok(ParsedEntry {
messages: Vec::new(),
history: vec![stats.statistics_text],
comments,
});
}
Err(ParseError::NoData)
}
/// Unit tests for `request_urls` and `parse`, using both recorded HTML fixtures
/// (pulled in with `include_str!`) and small synthetic pages.
#[cfg(test)]
mod tests {
use super::*;
/// Build a minimal page in the App Router flight-data format.
///
/// Backslashes and double quotes in `payload_json` are escaped so the text can
/// sit inside the double-quoted `push` argument; newlines are left untouched.
fn flight_page(payload_json: &str) -> String {
let escaped = payload_json.replace('\\', "\\\\").replace('"', "\\\"");
format!(r#"<html><body><script>self.__next_f.push([1,"{escaped}"])</script></body></html>"#)
}
/// A lookup yields exactly one URL with the number appended.
#[test]
fn requests_single_hitta_url() {
assert_eq!(
request_urls("0700000000"),
vec!["https://www.hitta.se/vem-ringde/0700000000".to_string()]
);
}
/// Recorded page for a reported number: summary text present, no usable comments.
#[test]
fn parses_reported_number_fixture() {
let body = include_str!("../../../../fixtures/hitta/fresh-0104754350.html");
let entry = parse(body).unwrap();
assert_eq!(entry.messages, Vec::<String>::new());
assert_eq!(
entry.history,
vec!["Elva andra har rapporterat detta nummer"]
);
// every comment on this number has empty text -> all filtered out
assert!(entry.comments.is_empty());
}
/// Recorded page for a number with search activity only: summary text, no comments.
#[test]
fn parses_low_activity_number_fixture() {
let body = include_str!("../../../../fixtures/hitta/fresh-0313908905.html");
let entry = parse(body).unwrap();
assert_eq!(
entry.history,
vec!["1000 andra har också sökt på detta nummer"]
);
assert!(entry.comments.is_empty());
}
/// Synthetic page checking comment filtering, ordering and timestamp conversion.
/// The payload also carries keys the parser does not model (`id`, `time`,
/// `upVotes`, `searches`), which must be ignored.
#[test]
fn extracts_and_converts_comments() {
let page = flight_page(
r#"{"foo":{"statistics":{"searches":5,"comments":[
{"id":"a","comment":"Telefonförsäljare","time":"03 okt","timestamp":1538574919000,"upVotes":1},
{"id":"b","comment":"","time":"04 okt","timestamp":1538661319000},
{"id":"c","comment":"Bluff","time":"05 okt","timestamp":1538747719000}
],"statisticsText":"Tre rapporter"}}}"#,
);
let entry = parse(&page).unwrap();
assert_eq!(entry.history, vec!["Tre rapporter"]);
// empty-text comment filtered; newest first; millis -> seconds
assert_eq!(entry.comments.len(), 2);
assert_eq!(entry.comments[0].timestamp, Some(1538747719));
assert_eq!(entry.comments[0].message, "Bluff");
assert_eq!(entry.comments[1].timestamp, Some(1538574919));
assert_eq!(entry.comments[1].message, "Telefonförsäljare");
assert_eq!(entry.comments[0].title, None);
}
/// Flight data that decodes but has no `statistics` key means "no data", not a failure.
#[test]
fn flight_data_without_statistics_is_no_data() {
let page = flight_page(r#"{"someOtherComponent":{"props":{}}}"#);
assert_eq!(parse(&page), Err(ParseError::NoData));
}
/// A page in the older format is reported as a structural failure.
#[test]
fn legacy_next_data_page_is_failed() {
// 2019 Pages Router fixture: no __next_f flight data at all
let body = include_str!("../../../../fixtures/hitta/0104754350.html");
assert!(matches!(parse(body), Err(ParseError::Failed(_))));
}
/// HTML without any flight script is a structural failure.
#[test]
fn garbage_is_failed() {
assert!(matches!(parse("<html></html>"), Err(ParseError::Failed(_))));
}
/// Pins the serialized (YAML) form of a full parse result with an inline insta snapshot.
#[test]
fn snapshot_reported_number() {
let body = include_str!("../../../../fixtures/hitta/fresh-0104754350.html");
insta::assert_yaml_snapshot!(parse(body), @r###"
Ok:
messages: []
history:
- Elva andra har rapporterat detta nummer
comments: []
"###);
}
}