use std::sync::LazyLock; use regex::Regex; use serde::{Deserialize, Serialize}; static FLIGHT_RE: LazyLock = LazyLock::new(|| { Regex::new(r#"(?s)self\.__next_f\.push\(\[1,"(.*?)"\]\)\s*"#) .expect("FLIGHT_RE is valid") }); #[derive(Debug, PartialEq, Serialize)] pub struct ParsedEntry { pub messages: Vec, pub history: Vec, pub comments: Vec, } #[derive(Debug, PartialEq, Serialize)] pub struct ParsedComment { /// Unix epoch seconds, UTC. pub timestamp: Option, pub title: Option, pub message: String, } #[derive(Debug, PartialEq, Serialize)] pub enum ParseError { /// Page fetched and understood, but contains no data for the number. NoData, /// Page structure did not match expectations — scraper rot signal. Failed(String), } #[derive(Debug, Deserialize)] #[serde(rename_all = "camelCase")] struct Statistics { #[serde(default)] comments: Vec, statistics_text: String, } #[derive(Debug, Deserialize)] struct RawComment { comment: String, timestamp: u64, } pub fn request_urls(number: &str) -> Vec { vec![format!("https://www.hitta.se/vem-ringde/{number}")] } pub fn parse(body: &str) -> Result { let captures: Vec<&str> = FLIGHT_RE .captures_iter(body) .filter_map(|cap| cap.get(1).map(|m| m.as_str())) .collect(); if captures.is_empty() { return Err(ParseError::Failed( "__next_f flight data not found".to_string(), )); } for raw_payload in &captures { // Unescape the JSON string captured from the HTML attribute. // We wrap it as a JSON string value so serde_json handles all escape // sequences correctly. Literal newlines (which appear in synthetic test // payloads) are escaped first so the JSON remains valid. let sanitized = raw_payload.replace('\n', "\\n").replace('\r', "\\r"); let json_str = format!(r#""{sanitized}""#); let payload: String = match serde_json::from_str(&json_str) { Ok(s) => s, Err(_) => continue, }; let marker = "\"statistics\":"; let idx = match payload.find(marker) { Some(idx) => idx, None => continue, }; // Found the statistics marker — deserialize or report rot. let after_marker = &payload[idx + marker.len()..]; let mut de = serde_json::Deserializer::from_str(after_marker); let stats = match Statistics::deserialize(&mut de) { Ok(s) => s, Err(err) => return Err(ParseError::Failed(err.to_string())), }; let mut comments: Vec = stats .comments .into_iter() .filter(|raw| !raw.comment.trim().is_empty()) .map(|raw| ParsedComment { timestamp: Some((raw.timestamp / 1000) as i64), title: None, message: raw.comment, }) .collect(); comments.sort_by(|a, b| b.timestamp.cmp(&a.timestamp)); return Ok(ParsedEntry { messages: Vec::new(), history: vec![stats.statistics_text], comments, }); } Err(ParseError::NoData) } #[cfg(test)] mod tests { use super::*; /// Build a minimal page in the App Router flight-data format. fn flight_page(payload_json: &str) -> String { let escaped = payload_json.replace('\\', "\\\\").replace('"', "\\\""); format!(r#""#) } #[test] fn requests_single_hitta_url() { assert_eq!( request_urls("0700000000"), vec!["https://www.hitta.se/vem-ringde/0700000000".to_string()] ); } #[test] fn parses_reported_number_fixture() { let body = include_str!("../../../../fixtures/hitta/fresh-0104754350.html"); let entry = parse(body).unwrap(); assert_eq!(entry.messages, Vec::::new()); assert_eq!( entry.history, vec!["Elva andra har rapporterat detta nummer"] ); // every comment on this number has empty text -> all filtered out assert!(entry.comments.is_empty()); } #[test] fn parses_low_activity_number_fixture() { let body = include_str!("../../../../fixtures/hitta/fresh-0313908905.html"); let entry = parse(body).unwrap(); assert_eq!( entry.history, vec!["1000 andra har också sökt på detta nummer"] ); assert!(entry.comments.is_empty()); } #[test] fn extracts_and_converts_comments() { let page = flight_page( r#"{"foo":{"statistics":{"searches":5,"comments":[ {"id":"a","comment":"Telefonförsäljare","time":"03 okt","timestamp":1538574919000,"upVotes":1}, {"id":"b","comment":"","time":"04 okt","timestamp":1538661319000}, {"id":"c","comment":"Bluff","time":"05 okt","timestamp":1538747719000} ],"statisticsText":"Tre rapporter"}}}"#, ); let entry = parse(&page).unwrap(); assert_eq!(entry.history, vec!["Tre rapporter"]); // empty-text comment filtered; newest first; millis -> seconds assert_eq!(entry.comments.len(), 2); assert_eq!(entry.comments[0].timestamp, Some(1538747719)); assert_eq!(entry.comments[0].message, "Bluff"); assert_eq!(entry.comments[1].timestamp, Some(1538574919)); assert_eq!(entry.comments[1].message, "Telefonförsäljare"); assert_eq!(entry.comments[0].title, None); } #[test] fn flight_data_without_statistics_is_no_data() { let page = flight_page(r#"{"someOtherComponent":{"props":{}}}"#); assert_eq!(parse(&page), Err(ParseError::NoData)); } #[test] fn legacy_next_data_page_is_failed() { // 2019 Pages Router fixture: no __next_f flight data at all let body = include_str!("../../../../fixtures/hitta/0104754350.html"); assert!(matches!(parse(body), Err(ParseError::Failed(_)))); } #[test] fn garbage_is_failed() { assert!(matches!(parse(""), Err(ParseError::Failed(_)))); } #[test] fn snapshot_reported_number() { let body = include_str!("../../../../fixtures/hitta/fresh-0104754350.html"); insta::assert_yaml_snapshot!(parse(body), @r###" Ok: messages: [] history: - Elva andra har rapporterat detta nummer comments: [] "###); } }