feat: add hitta.se flight-data parser as pure native-testable functions

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-06-05 14:54:04 +02:00
parent 896333254a
commit 4980beec0a
4 changed files with 736 additions and 0 deletions
Generated
+519
View File
@@ -2,10 +2,529 @@
# It is not intended for manual editing.
version = 4
[[package]]
name = "aho-corasick"
version = "1.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301"
dependencies = [
"memchr",
]
[[package]]
name = "anyhow"
version = "1.0.102"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c"
[[package]]
name = "bitflags"
version = "2.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "84d7ced0ae9557296835c32bf1b1e02b44c746701f898460fb000d7eaa84f00a"
[[package]]
name = "cfg-if"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
[[package]]
name = "console"
version = "0.16.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d64e8af5551369d19cf50138de61f1c42074ab970f74e99be916646777f8fc87"
dependencies = [
"encode_unicode",
"libc",
"windows-sys",
]
[[package]]
name = "encode_unicode"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0"
[[package]]
name = "equivalent"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f"
[[package]]
name = "errno"
version = "0.3.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb"
dependencies = [
"libc",
"windows-sys",
]
[[package]]
name = "fastrand"
version = "2.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6"
[[package]]
name = "foldhash"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
[[package]]
name = "getrandom"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555"
dependencies = [
"cfg-if",
"libc",
"r-efi",
"wasip2",
"wasip3",
]
[[package]]
name = "hashbrown"
version = "0.15.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1"
dependencies = [
"foldhash",
]
[[package]]
name = "hashbrown"
version = "0.17.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a"
[[package]]
name = "heck"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
[[package]]
name = "id-arena"
version = "2.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954"
[[package]]
name = "indexmap"
version = "2.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9"
dependencies = [
"equivalent",
"hashbrown 0.17.1",
"serde",
"serde_core",
]
[[package]]
name = "insta"
version = "1.47.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7b4a6248eb93a4401ed2f37dfe8ea592d3cf05b7cf4f8efa867b6895af7e094e"
dependencies = [
"console",
"once_cell",
"serde",
"similar",
"tempfile",
]
[[package]]
name = "itoa"
version = "1.0.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682"
[[package]]
name = "leb128fmt"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2"
[[package]]
name = "libc"
version = "0.2.186"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66"
[[package]]
name = "linux-raw-sys"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53"
[[package]]
name = "log"
version = "0.4.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "953f07c43838f8e6f9758cab68bf5bed85465e7587ebe0b823f1bcd81978ad3a"
[[package]]
name = "memchr"
version = "2.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6b947ae49db0d222b1dbc6b113ce7248a3fc3a6ca21b696717bfc000ba4484d8"
[[package]]
name = "once_cell"
version = "1.21.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50"
[[package]]
name = "prettyplease"
version = "0.2.37"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b"
dependencies = [
"proc-macro2",
"syn",
]
[[package]]
name = "proc-macro2"
version = "1.0.106"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.45"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924"
dependencies = [
"proc-macro2",
]
[[package]]
name = "r-efi"
version = "6.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf"
[[package]]
name = "regex"
version = "1.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276"
dependencies = [
"aho-corasick",
"memchr",
"regex-automata",
"regex-syntax",
]
[[package]]
name = "regex-automata"
version = "0.4.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax",
]
[[package]]
name = "regex-syntax"
version = "0.8.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a"
[[package]]
name = "rustix"
version = "1.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190"
dependencies = [
"bitflags",
"errno",
"libc",
"linux-raw-sys",
"windows-sys",
]
[[package]]
name = "semver"
version = "1.0.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd"
[[package]]
name = "serde"
version = "1.0.228"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
dependencies = [
"serde_core",
"serde_derive",
]
[[package]]
name = "serde_core"
version = "1.0.228"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.228"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "serde_json"
version = "1.0.150"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e8014e44b4736ed0538adeecded0fce2a272f22dc9578a7eb6b2d9993c74cfb9"
dependencies = [
"itoa",
"memchr",
"serde",
"serde_core",
"zmij",
]
[[package]]
name = "similar"
version = "2.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbbb5d9659141646ae647b42fe094daf6c6192d1620870b449d9557f748b2daa"
[[package]]
name = "syn"
version = "2.0.117"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "tempfile"
version = "3.27.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd"
dependencies = [
"fastrand",
"getrandom",
"once_cell",
"rustix",
"windows-sys",
]
[[package]]
name = "unicode-ident"
version = "1.0.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75"
[[package]]
name = "unicode-xid"
version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853"
[[package]]
name = "wasip2"
version = "1.0.3+wasi-0.2.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "20064672db26d7cdc89c7798c48a0fdfac8213434a1186e5ef29fd560ae223d6"
dependencies = [
"wit-bindgen 0.57.1",
]
[[package]]
name = "wasip3"
version = "0.4.0+wasi-0.3.0-rc-2026-01-06"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5"
dependencies = [
"wit-bindgen 0.51.0",
]
[[package]]
name = "wasm-encoder"
version = "0.244.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319"
dependencies = [
"leb128fmt",
"wasmparser",
]
[[package]]
name = "wasm-metadata"
version = "0.244.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909"
dependencies = [
"anyhow",
"indexmap",
"wasm-encoder",
"wasmparser",
]
[[package]]
name = "wasmparser"
version = "0.244.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe"
dependencies = [
"bitflags",
"hashbrown 0.15.5",
"indexmap",
"semver",
]
[[package]]
name = "whoareyou-provider-hitta"
version = "0.1.0"
dependencies = [
"insta",
"regex",
"serde",
"serde_json",
]
[[package]]
name = "whoareyou-server"
version = "0.1.0"
[[package]]
name = "windows-link"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
[[package]]
name = "windows-sys"
version = "0.61.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc"
dependencies = [
"windows-link",
]
[[package]]
name = "wit-bindgen"
version = "0.51.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5"
dependencies = [
"wit-bindgen-rust-macro",
]
[[package]]
name = "wit-bindgen"
version = "0.57.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e"
[[package]]
name = "wit-bindgen-core"
version = "0.51.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc"
dependencies = [
"anyhow",
"heck",
"wit-parser",
]
[[package]]
name = "wit-bindgen-rust"
version = "0.51.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21"
dependencies = [
"anyhow",
"heck",
"indexmap",
"prettyplease",
"syn",
"wasm-metadata",
"wit-bindgen-core",
"wit-component",
]
[[package]]
name = "wit-bindgen-rust-macro"
version = "0.51.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a"
dependencies = [
"anyhow",
"prettyplease",
"proc-macro2",
"quote",
"syn",
"wit-bindgen-core",
"wit-bindgen-rust",
]
[[package]]
name = "wit-component"
version = "0.244.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2"
dependencies = [
"anyhow",
"bitflags",
"indexmap",
"log",
"serde",
"serde_derive",
"serde_json",
"wasm-encoder",
"wasm-metadata",
"wasmparser",
"wit-parser",
]
[[package]]
name = "wit-parser"
version = "0.244.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736"
dependencies = [
"anyhow",
"id-arena",
"indexmap",
"log",
"semver",
"serde",
"serde_derive",
"serde_json",
"unicode-xid",
"wasmparser",
]
[[package]]
name = "zmij"
version = "1.0.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa"
+4
View File
@@ -8,5 +8,9 @@ authors.workspace = true
crate-type = ["cdylib", "rlib"]
[dependencies]
regex = "1"
serde = { version = "1", features = ["derive"] }
serde_json = "1"
[dev-dependencies]
insta = { version = "1.47", features = ["yaml"] }
+1
View File
@@ -1 +1,2 @@
// modules added as they are implemented
pub mod parser;
+212
View File
@@ -0,0 +1,212 @@
use std::sync::LazyLock;
use regex::Regex;
use serde::{Deserialize, Serialize};
/// Matches one inline flight-data script call of the form
/// `self.__next_f.push([1,"…"])` that is followed, after optional whitespace,
/// by `</script>`.
///
/// Capture group 1 is the still-escaped content between the double quotes.
/// The `(?s)` flag lets `.` match newlines, and the lazy `.*?` stops at the
/// first `"])` + `</script>` sequence so each script chunk is a separate match.
/// The regex is compiled on first use.
static FLIGHT_RE: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r#"(?s)self\.__next_f\.push\(\[1,"(.*?)"\]\)\s*</script>"#)
// The pattern is a compile-time constant, so failing here is a programming bug.
.expect("FLIGHT_RE is valid")
});
/// Result of successfully parsing one page.
///
/// Field order is part of the serialized output (see the inline snapshot test),
/// so do not reorder fields casually.
#[derive(Debug, PartialEq, Serialize)]
pub struct ParsedEntry {
/// Free-form messages; `parse` in this module always leaves this empty.
pub messages: Vec<String>,
/// Summary lines; `parse` fills this with the single `statisticsText` value.
pub history: Vec<String>,
/// Non-empty user comments, sorted newest first by `parse`.
pub comments: Vec<ParsedComment>,
}
/// A single user comment extracted from the page.
#[derive(Debug, PartialEq, Serialize)]
pub struct ParsedComment {
/// Unix epoch seconds, UTC.
pub timestamp: Option<i64>,
/// Optional headline; `parse` in this module always sets it to `None`.
pub title: Option<String>,
/// Comment text as found in the page (never blank when produced by `parse`).
pub message: String,
}
/// Reasons [`parse`] can fail to produce a [`ParsedEntry`].
///
/// Implements [`std::fmt::Display`] and [`std::error::Error`] so callers can
/// print it or propagate it with `?` into `Box<dyn Error>`-style error types.
#[derive(Debug, PartialEq, Serialize)]
pub enum ParseError {
    /// Page fetched and understood, but contains no data for the number.
    NoData,
    /// Page structure did not match expectations — scraper rot signal.
    Failed(String),
}

impl std::fmt::Display for ParseError {
    /// Writes a short human-readable description; `Failed` includes its reason.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::NoData => f.write_str("page contains no data for the number"),
            Self::Failed(reason) => write!(f, "page could not be parsed: {reason}"),
        }
    }
}

impl std::error::Error for ParseError {}
/// Wire shape of the JSON object that follows the `"statistics":` key.
///
/// Keys are read in camelCase (`statistics_text` comes from `statisticsText`);
/// keys not listed here are ignored by the derive.
#[derive(Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
struct Statistics {
// A missing `comments` key deserializes to an empty list instead of an error.
#[serde(default)]
comments: Vec<RawComment>,
// Required: deserialization fails if `statisticsText` is absent.
statistics_text: String,
}
/// Wire shape of one element of the `comments` array; other keys are ignored.
#[derive(Debug, Deserialize)]
struct RawComment {
// Comment text; may be empty or whitespace-only, in which case `parse` drops it.
comment: String,
// Epoch timestamp that `parse` divides by 1000 to get seconds
// (the tests treat it as milliseconds).
timestamp: u64,
}
/// Returns the page URLs to fetch for `number`.
///
/// There is exactly one: the "vem-ringde" lookup page with `number` appended
/// verbatim (no validation or escaping is applied).
pub fn request_urls(number: &str) -> Vec<String> {
    let mut url = String::from("https://www.hitta.se/vem-ringde/");
    url.push_str(number);
    vec![url]
}
pub fn parse(body: &str) -> Result<ParsedEntry, ParseError> {
let captures: Vec<&str> = FLIGHT_RE
.captures_iter(body)
.filter_map(|cap| cap.get(1).map(|m| m.as_str()))
.collect();
if captures.is_empty() {
return Err(ParseError::Failed(
"__next_f flight data not found".to_string(),
));
}
for raw_payload in &captures {
// Unescape the JSON string captured from the HTML attribute.
// We wrap it as a JSON string value so serde_json handles all escape
// sequences correctly. Literal newlines (which appear in synthetic test
// payloads) are escaped first so the JSON remains valid.
let sanitized = raw_payload.replace('\n', "\\n").replace('\r', "\\r");
let json_str = format!(r#""{sanitized}""#);
let payload: String = match serde_json::from_str(&json_str) {
Ok(s) => s,
Err(_) => continue,
};
let marker = "\"statistics\":";
let idx = match payload.find(marker) {
Some(idx) => idx,
None => continue,
};
// Found the statistics marker — deserialize or report rot.
let after_marker = &payload[idx + marker.len()..];
let mut de = serde_json::Deserializer::from_str(after_marker);
let stats = match Statistics::deserialize(&mut de) {
Ok(s) => s,
Err(err) => return Err(ParseError::Failed(err.to_string())),
};
let mut comments: Vec<ParsedComment> = stats
.comments
.into_iter()
.filter(|raw| !raw.comment.trim().is_empty())
.map(|raw| ParsedComment {
timestamp: Some((raw.timestamp / 1000) as i64),
title: None,
message: raw.comment,
})
.collect();
comments.sort_by(|a, b| b.timestamp.cmp(&a.timestamp));
return Ok(ParsedEntry {
messages: Vec::new(),
history: vec![stats.statistics_text],
comments,
});
}
Err(ParseError::NoData)
}
// Unit tests for the parser: synthetic pages built with `flight_page`, plus
// HTML fixtures included at compile time from the repository's fixtures tree.
#[cfg(test)]
mod tests {
use super::*;
/// Build a minimal page in the App Router flight-data format.
///
/// Backslashes and double quotes in `payload_json` are escaped so it can sit
/// inside the script's string literal; literal newlines are left untouched.
fn flight_page(payload_json: &str) -> String {
// Backslashes must be escaped before quotes so the quote escapes are not doubled.
let escaped = payload_json.replace('\\', "\\\\").replace('"', "\\\"");
format!(r#"<html><body><script>self.__next_f.push([1,"{escaped}"])</script></body></html>"#)
}
// `request_urls` yields exactly one URL with the number appended verbatim.
#[test]
fn requests_single_hitta_url() {
assert_eq!(
request_urls("0700000000"),
vec!["https://www.hitta.se/vem-ringde/0700000000".to_string()]
);
}
// Captured page for a reported number: statistics text present, no usable comments.
#[test]
fn parses_reported_number_fixture() {
let body = include_str!("../../../../fixtures/hitta/fresh-0104754350.html");
let entry = parse(body).unwrap();
assert_eq!(entry.messages, Vec::<String>::new());
assert_eq!(
entry.history,
vec!["Elva andra har rapporterat detta nummer"]
);
// every comment on this number has empty text -> all filtered out
assert!(entry.comments.is_empty());
}
// Captured page for a number with only search statistics and no comments.
#[test]
fn parses_low_activity_number_fixture() {
let body = include_str!("../../../../fixtures/hitta/fresh-0313908905.html");
let entry = parse(body).unwrap();
assert_eq!(
entry.history,
vec!["1000 andra har också sökt på detta nummer"]
);
assert!(entry.comments.is_empty());
}
// Synthetic payload covering comment filtering, ordering and timestamp conversion.
// The raw string contains literal newlines, which `parse` escapes before decoding.
#[test]
fn extracts_and_converts_comments() {
let page = flight_page(
r#"{"foo":{"statistics":{"searches":5,"comments":[
{"id":"a","comment":"Telefonförsäljare","time":"03 okt","timestamp":1538574919000,"upVotes":1},
{"id":"b","comment":"","time":"04 okt","timestamp":1538661319000},
{"id":"c","comment":"Bluff","time":"05 okt","timestamp":1538747719000}
],"statisticsText":"Tre rapporter"}}}"#,
);
let entry = parse(&page).unwrap();
assert_eq!(entry.history, vec!["Tre rapporter"]);
// empty-text comment filtered; newest first; millis -> seconds
assert_eq!(entry.comments.len(), 2);
assert_eq!(entry.comments[0].timestamp, Some(1538747719));
assert_eq!(entry.comments[0].message, "Bluff");
assert_eq!(entry.comments[1].timestamp, Some(1538574919));
assert_eq!(entry.comments[1].message, "Telefonförsäljare");
assert_eq!(entry.comments[0].title, None);
}
// Flight data exists but no chunk has a "statistics" key -> NoData, not Failed.
#[test]
fn flight_data_without_statistics_is_no_data() {
let page = flight_page(r#"{"someOtherComponent":{"props":{}}}"#);
assert_eq!(parse(&page), Err(ParseError::NoData));
}
// A page in the older format must be reported as a structural failure.
#[test]
fn legacy_next_data_page_is_failed() {
// 2019 Pages Router fixture: no __next_f flight data at all
let body = include_str!("../../../../fixtures/hitta/0104754350.html");
assert!(matches!(parse(body), Err(ParseError::Failed(_))));
}
// Input with no script chunks at all is a structural failure as well.
#[test]
fn garbage_is_failed() {
assert!(matches!(parse("<html></html>"), Err(ParseError::Failed(_))));
}
// Inline snapshot of the full serialized `Result` for the reported-number fixture;
// it pins the field order and serialized shape of `ParsedEntry`.
#[test]
fn snapshot_reported_number() {
let body = include_str!("../../../../fixtures/hitta/fresh-0104754350.html");
insta::assert_yaml_snapshot!(parse(body), @r###"
Ok:
messages: []
history:
- Elva andra har rapporterat detta nummer
comments: []
"###);
}
}