From 242acff987d04b279125b4a8c4e7071aecb8ff45 Mon Sep 17 00:00:00 2001 From: Andras Schmelczer Date: Tue, 3 Feb 2026 20:26:57 +0000 Subject: [PATCH] Refactor and improve --- server-rs/src/consts.rs | 16 - server-rs/src/data.rs | 5 + server-rs/src/features.rs | 17 +- server-rs/src/filter.rs | 94 ----- server-rs/src/main.rs | 84 +---- server-rs/src/og_middleware.rs | 24 +- server-rs/src/parsing.rs | 5 + .../{routes/parse.rs => parsing/bounds.rs} | 32 ++ server-rs/src/parsing/filters.rs | 166 +++++++++ server-rs/src/{routes/mod.rs => routes.rs} | 3 +- server-rs/src/routes/features.rs | 67 ++-- server-rs/src/routes/hexagon_stats.rs | 335 ++++++++--------- server-rs/src/routes/hexagons.rs | 351 ++++-------------- server-rs/src/routes/og_image.rs | 9 +- server-rs/src/routes/pois.rs | 74 ++-- server-rs/src/routes/properties.rs | 86 ++--- server-rs/src/state.rs | 26 +- server-rs/src/tests.rs | 251 ------------- server-rs/src/utils.rs | 7 + server-rs/src/{ => utils}/grid_index.rs | 48 ++- server-rs/src/utils/hash.rs | 39 ++ server-rs/src/utils/interned_column.rs | 68 ++++ 22 files changed, 754 insertions(+), 1053 deletions(-) create mode 100644 server-rs/src/data.rs delete mode 100644 server-rs/src/filter.rs create mode 100644 server-rs/src/parsing.rs rename server-rs/src/{routes/parse.rs => parsing/bounds.rs} (57%) create mode 100644 server-rs/src/parsing/filters.rs rename server-rs/src/{routes/mod.rs => routes.rs} (79%) delete mode 100644 server-rs/src/tests.rs create mode 100644 server-rs/src/utils.rs rename server-rs/src/{ => utils}/grid_index.rs (83%) create mode 100644 server-rs/src/utils/hash.rs create mode 100644 server-rs/src/utils/interned_column.rs diff --git a/server-rs/src/consts.rs b/server-rs/src/consts.rs index da867b9..7fb1864 100644 --- a/server-rs/src/consts.rs +++ b/server-rs/src/consts.rs @@ -9,22 +9,6 @@ pub const SERVER_ADDRESS: &str = "0.0.0.0:8001"; pub const BOUNDS_QUANTIZATION: f64 = 0.01; pub const BOUNDS_BUFFER_PERCENT: f64 = 0.1; pub const GRID_CELL_SIZE: f32 = 0.01; -pub const POSTCODE_MIN_RESOLUTION: u8 = 11; pub const MAX_POIS_PER_REQUEST: usize = 2500; pub const DEFAULT_PROPERTIES_LIMIT: usize = 100; pub const MAX_PROPERTIES_LIMIT: usize = 500; -pub const ENUM_NULL: u8 = 255; - -/// Canonical display order for POI category groups. -/// The server will panic at startup if the data contains groups not in this list or vice versa. -pub const POI_GROUP_ORDER: &[&str] = &[ - "Public Transport", - "Amenity", - "Building", - "Craft", - "Healthcare", - "Leisure", - "Office", - "Shop", - "Tourism", -]; diff --git a/server-rs/src/data.rs b/server-rs/src/data.rs new file mode 100644 index 0000000..15db5f5 --- /dev/null +++ b/server-rs/src/data.rs @@ -0,0 +1,5 @@ +mod poi; +mod property; + +pub use poi::{POICategoryGroup, POIData}; +pub use property::{precompute_h3, Histogram, PropertyData}; diff --git a/server-rs/src/features.rs b/server-rs/src/features.rs index d25ac37..a6f4e0d 100644 --- a/server-rs/src/features.rs +++ b/server-rs/src/features.rs @@ -45,8 +45,7 @@ pub struct EnumFeatureGroup { pub features: &'static [EnumFeatureConfig], } -/// Columns in parquet that are neither numeric features nor enum features. -/// These are silently skipped during schema validation. +/// Columns in parquet that are not filterable pub const IGNORED_COLUMNS: &[&str] = &[ "lat", "lon", @@ -792,3 +791,17 @@ pub fn bounds_for(name: &str) -> Option<&'static Bounds> { .find(|feature| feature.name == name) .map(|feature| &feature.bounds) } + +/// Canonical display order for POI category groups. +/// The server will panic at startup if the data contains groups not in this list or vice versa. +pub const POI_GROUP_ORDER: &[&str] = &[ + "Public Transport", + "Amenity", + "Building", + "Craft", + "Healthcare", + "Leisure", + "Office", + "Shop", + "Tourism", +]; diff --git a/server-rs/src/filter.rs b/server-rs/src/filter.rs deleted file mode 100644 index 21551e4..0000000 --- a/server-rs/src/filter.rs +++ /dev/null @@ -1,94 +0,0 @@ -use crate::consts::ENUM_NULL; -use crate::data::EnumFeatureData; - -pub struct ParsedFilter { - pub feat_idx: usize, - pub min: f32, - pub max: f32, -} - -pub struct ParsedEnumFilter { - pub enum_idx: usize, - pub allowed: Vec, -} - -/// Parse comma-separated filter string into numeric and enum filters. -/// Numeric format: `name:min:max` -/// Enum format: `name:val1|val2|val3` (pipe-separated values) -pub fn parse_filters( - filter_str: Option<&str>, - feature_names: &[String], - enum_features: &[EnumFeatureData], -) -> (Vec, Vec) { - let mut numeric = Vec::new(); - let mut enums = Vec::new(); - - let input = match filter_str.filter(|text| !text.is_empty()) { - Some(text) => text, - None => return (numeric, enums), - }; - - for entry in input.split(',') { - let parts: Vec<&str> = entry.splitn(2, ':').collect(); - if parts.len() != 2 { - continue; - } - let name = parts[0].trim(); - let rest = parts[1].trim(); - - if let Some(enum_idx) = enum_features - .iter() - .position(|enum_feat| enum_feat.name == name) - { - let enum_feat = &enum_features[enum_idx]; - let allowed: Vec = rest - .split('|') - .filter_map(|value| { - let value = value.trim(); - enum_feat - .values - .iter() - .position(|existing| existing == value) - .map(|position| position as u8) - }) - .collect(); - enums.push(ParsedEnumFilter { enum_idx, allowed }); - } else { - let num_parts: Vec<&str> = rest.splitn(2, ':').collect(); - if num_parts.len() != 2 { - continue; - } - let min = match num_parts[0].trim().parse::() { - Ok(value) => value, - Err(_) => continue, - }; - let max = match num_parts[1].trim().parse::() { - Ok(value) => value, - Err(_) => continue, - }; - if let Some(feat_idx) = feature_names.iter().position(|feat_name| feat_name == name) { - numeric.push(ParsedFilter { feat_idx, min, max }); - } - } - } - - (numeric, enums) -} - -pub fn row_passes_filters( - row: usize, - filters: &[ParsedFilter], - enum_filters: &[ParsedEnumFilter], - feature_data: &[f32], - num_features: usize, - enum_data: &[u8], - num_enums: usize, -) -> bool { - filters.iter().all(|filter| { - let value = feature_data[row * num_features + filter.feat_idx]; - value.is_finite() && value >= filter.min && value <= filter.max - }) && enum_filters.iter().all(|enum_filter| { - let value = enum_data[row * num_enums + enum_filter.enum_idx]; - value != ENUM_NULL && enum_filter.allowed.contains(&value) - }) -} diff --git a/server-rs/src/main.rs b/server-rs/src/main.rs index 41fdcf3..ca57999 100644 --- a/server-rs/src/main.rs +++ b/server-rs/src/main.rs @@ -1,13 +1,11 @@ mod consts; mod data; mod features; -mod filter; -mod grid_index; mod og_middleware; +pub mod parsing; mod routes; mod state; -#[cfg(test)] -mod tests; +pub mod utils; use std::path::PathBuf; use std::sync::Arc; @@ -21,7 +19,7 @@ use tower_http::compression::CompressionLayer; use tower_http::cors::{Any, CorsLayer}; use tower_http::services::ServeDir; use tower_http::trace::TraceLayer; -use tracing::info; +use tracing::{info, warn}; use tracing_subscriber::EnvFilter; use state::AppState; @@ -78,12 +76,12 @@ async fn main() -> anyhow::Result<()> { info!( rows = property_data.lat.len(), features = property_data.num_features, - enums = property_data.enum_features.len(), + enums = property_data.enum_values.len(), "Property data loaded" ); info!("Building spatial grid index (0.01° cells)"); - let grid = grid_index::GridIndex::build( + let grid = utils::GridIndex::build( &property_data.lat, &property_data.lon, consts::GRID_CELL_SIZE, @@ -107,7 +105,7 @@ async fn main() -> anyhow::Result<()> { info!("Building POI spatial grid index"); let poi_grid = - grid_index::GridIndex::build(&poi_data.lat, &poi_data.lng, consts::GRID_CELL_SIZE); + utils::GridIndex::build(&poi_data.lat, &poi_data.lng, consts::GRID_CELL_SIZE); let min_keys: Vec = property_data .feature_names @@ -119,64 +117,8 @@ async fn main() -> anyhow::Result<()> { .iter() .map(|name| format!("max_{}", name)) .collect(); - let enum_min_keys: Vec = property_data - .enum_features - .iter() - .map(|enum_feature| format!("min_{}", enum_feature.name)) - .collect(); - let enum_max_keys: Vec = property_data - .enum_features - .iter() - .map(|enum_feature| format!("max_{}", enum_feature.name)) - .collect(); - // Precompute POI category groups - let poi_category_groups = { - let mut group_cats: std::collections::HashMap> = - std::collections::HashMap::new(); - let num_pois = poi_data.category.indices.len(); - for row in 0..num_pois { - let category = poi_data.category.get(row).to_string(); - let group = poi_data.group.get(row).to_string(); - group_cats.entry(group).or_default().insert(category); - } - // Validate that data groups match the hardcoded order exactly - let expected: std::collections::HashSet<&str> = - consts::POI_GROUP_ORDER.iter().copied().collect(); - let actual: std::collections::HashSet<&str> = - group_cats.keys().map(|key| key.as_str()).collect(); - let missing_from_data: Vec<&&str> = expected.difference(&actual).collect(); - let missing_from_order: Vec<&&str> = actual.difference(&expected).collect(); - if !missing_from_data.is_empty() || !missing_from_order.is_empty() { - bail!( - "POI group mismatch!\n In POI_GROUP_ORDER but not in data: {:?}\n In data but not in POI_GROUP_ORDER: {:?}", - missing_from_data, missing_from_order - ); - } - consts::POI_GROUP_ORDER - .iter() - .map(|group_name| group_name.to_string()) - .collect::>() - .into_iter() - .map(|name| { - let mut categories: Vec = group_cats - .remove(&name) - .context("POI group validated but missing from map")? - .into_iter() - .collect(); - categories.sort(); - Ok(state::POICategoryGroup { name, categories }) - }) - .collect::>>()? - }; - - // Precompute enum name → index map - let enum_name_to_idx: rustc_hash::FxHashMap = property_data - .enum_features - .iter() - .enumerate() - .map(|(index, enum_feature)| (enum_feature.name.clone(), index)) - .collect(); + let poi_category_groups = poi_data.category_groups()?; // Read index.html at startup for crawler OG injection let frontend_dist = cli.dist.unwrap_or_else(|| { @@ -200,7 +142,7 @@ async fn main() -> anyhow::Result<()> { Some(html) } Err(err) => { - tracing::warn!("Could not read index.html: {}", err); + warn!("Could not read index.html: {}", err); None } } @@ -217,6 +159,12 @@ async fn main() -> anyhow::Result<()> { ); } + let features_response = routes::build_features_response(&property_data); + info!( + groups = features_response.groups.len(), + "Precomputed features response" + ); + let state = Arc::new(AppState { data: property_data, grid, @@ -225,10 +173,8 @@ async fn main() -> anyhow::Result<()> { poi_grid, min_keys, max_keys, - enum_min_keys, - enum_max_keys, poi_category_groups, - enum_name_to_idx, + features_response, og_sidecar_url: cli.og_sidecar_url, public_url: cli.public_url, index_html, diff --git a/server-rs/src/og_middleware.rs b/server-rs/src/og_middleware.rs index 33cc28b..cc87b32 100644 --- a/server-rs/src/og_middleware.rs +++ b/server-rs/src/og_middleware.rs @@ -5,10 +5,11 @@ use axum::extract::Request; use axum::http::header; use axum::middleware::Next; use axum::response::Response; -use regex::Regex; use crate::state::AppState; +const OG_PLACEHOLDER: &str = r#""#; + pub async fn og_middleware(request: Request, next: Next) -> Response { // Capture the query string before passing the request through let query_string = request.uri().query().unwrap_or("").to_string(); @@ -46,19 +47,18 @@ pub async fn og_middleware(request: Request, next: Next) -> Response { }; let og_tags = format!( - r#"Narrowit - - - - - - -"# + r#" + + + + + + + + "# ); - // Replace the tag with title + OG meta tags - let re = Regex::new(r"<title>Narrowit").unwrap(); - let html = re.replace(index_html, og_tags.as_str()).to_string(); + let html = index_html.replace(OG_PLACEHOLDER, &og_tags); let (parts, _body) = response.into_parts(); Response::from_parts(parts, Body::from(html)) diff --git a/server-rs/src/parsing.rs b/server-rs/src/parsing.rs new file mode 100644 index 0000000..4bcfe19 --- /dev/null +++ b/server-rs/src/parsing.rs @@ -0,0 +1,5 @@ +mod bounds; +mod filters; + +pub use bounds::{h3_cell_bounds, parse_bounds}; +pub use filters::{parse_filters, row_passes_filters, ParsedEnumFilter, ParsedFilter}; diff --git a/server-rs/src/routes/parse.rs b/server-rs/src/parsing/bounds.rs similarity index 57% rename from server-rs/src/routes/parse.rs rename to server-rs/src/parsing/bounds.rs index 853766d..b7ab586 100644 --- a/server-rs/src/routes/parse.rs +++ b/server-rs/src/parsing/bounds.rs @@ -50,3 +50,35 @@ pub fn parse_bounds(bounds_str: &str) -> Result<(f64, f64, f64, f64), (StatusCod Ok((parts[0], parts[1], parts[2], parts[3])) } + +#[cfg(test)] +mod tests { + use super::*; + use std::str::FromStr; + + #[test] + fn parse_bounds_valid() { + assert_eq!(parse_bounds("1.0,2.0,3.0,4.0").unwrap(), (1.0, 2.0, 3.0, 4.0)); + assert_eq!(parse_bounds("-51.5, -0.1, 51.6, 0.2").unwrap(), (-51.5, -0.1, 51.6, 0.2)); + } + + #[test] + fn parse_bounds_invalid() { + assert!(parse_bounds("1.0,2.0,3.0").is_err()); + assert!(parse_bounds("1.0,2.0,3.0,4.0,5.0").is_err()); + assert!(parse_bounds("a,b,c,d").is_err()); + assert!(parse_bounds("").is_err()); + } + + #[test] + fn h3_cell_bounds_applies_buffer() { + let cell = h3o::CellIndex::from_str("8928308280fffff").unwrap(); + let (min_lat, min_lon, max_lat, max_lon) = h3_cell_bounds(cell, 0.0); + let (buf_min_lat, buf_min_lon, buf_max_lat, buf_max_lon) = h3_cell_bounds(cell, 0.1); + + assert!((min_lat - buf_min_lat - 0.1).abs() < 1e-10); + assert!((min_lon - buf_min_lon - 0.1).abs() < 1e-10); + assert!((buf_max_lat - max_lat - 0.1).abs() < 1e-10); + assert!((buf_max_lon - max_lon - 0.1).abs() < 1e-10); + } +} diff --git a/server-rs/src/parsing/filters.rs b/server-rs/src/parsing/filters.rs new file mode 100644 index 0000000..33ba76e --- /dev/null +++ b/server-rs/src/parsing/filters.rs @@ -0,0 +1,166 @@ +use rustc_hash::FxHashMap; + +/// Filter for numeric features: value must be in [min, max] range. +pub struct ParsedFilter { + pub feat_idx: usize, + pub min: f32, + pub max: f32, +} + +/// Filter for enum features: value must be one of the allowed indices. +pub struct ParsedEnumFilter { + pub feat_idx: usize, + pub allowed: Vec, +} + +/// Parse comma-separated filter string into numeric and enum filters. +/// Numeric format: `name:min:max` +/// Enum format: `name:val1|val2|val3` (pipe-separated string values) +pub fn parse_filters( + filter_str: Option<&str>, + feature_names: &[String], + enum_values: &FxHashMap>, +) -> (Vec, Vec) { + let mut numeric = Vec::new(); + let mut enums = Vec::new(); + + let input = match filter_str.filter(|text| !text.is_empty()) { + Some(text) => text, + None => return (numeric, enums), + }; + + for entry in input.split(',') { + let parts: Vec<&str> = entry.splitn(2, ':').collect(); + if parts.len() != 2 { + continue; + } + let name = parts[0].trim(); + let rest = parts[1].trim(); + + // Find feature index by name + let Some(feat_idx) = feature_names.iter().position(|feat_name| feat_name == name) else { + continue; + }; + + // Check if this is an enum feature + if let Some(values) = enum_values.get(&feat_idx) { + // Enum filter: convert string values to f32 indices + let allowed: Vec = rest + .split('|') + .filter_map(|value| { + let value = value.trim(); + values + .iter() + .position(|existing| existing == value) + .map(|position| position as f32) + }) + .collect(); + enums.push(ParsedEnumFilter { feat_idx, allowed }); + } else { + // Numeric filter: parse min:max + let num_parts: Vec<&str> = rest.splitn(2, ':').collect(); + if num_parts.len() != 2 { + continue; + } + let min = match num_parts[0].trim().parse::() { + Ok(value) => value, + Err(_) => continue, + }; + let max = match num_parts[1].trim().parse::() { + Ok(value) => value, + Err(_) => continue, + }; + numeric.push(ParsedFilter { feat_idx, min, max }); + } + } + + (numeric, enums) +} + +/// Check if a row passes all filters. +/// All features (numeric and enum) are stored in feature_data as f32. +pub fn row_passes_filters( + row: usize, + filters: &[ParsedFilter], + enum_filters: &[ParsedEnumFilter], + feature_data: &[f32], + num_features: usize, +) -> bool { + let base = row * num_features; + + filters.iter().all(|filter| { + let value = feature_data[base + filter.feat_idx]; + value.is_finite() && value >= filter.min && value <= filter.max + }) && enum_filters.iter().all(|filter| { + let value = feature_data[base + filter.feat_idx]; + value.is_finite() && filter.allowed.contains(&value) + }) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn feature_names() -> Vec { + vec!["price".into(), "area".into(), "rating".into()] + } + + fn enum_values() -> FxHashMap> { + let mut map = FxHashMap::default(); + map.insert(2, vec!["A".into(), "B".into(), "C".into()]); + map + } + + #[test] + fn parse_filters_numeric() { + let (numeric, enums) = parse_filters(Some("price:100:500"), &feature_names(), &enum_values()); + assert_eq!(numeric.len(), 1); + assert_eq!(numeric[0].feat_idx, 0); + assert_eq!(numeric[0].min, 100.0); + assert_eq!(numeric[0].max, 500.0); + assert!(enums.is_empty()); + } + + #[test] + fn parse_filters_enum() { + let (numeric, enums) = parse_filters(Some("rating:A|C"), &feature_names(), &enum_values()); + assert!(numeric.is_empty()); + assert_eq!(enums.len(), 1); + assert_eq!(enums[0].feat_idx, 2); + assert_eq!(enums[0].allowed, vec![0.0, 2.0]); + } + + #[test] + fn parse_filters_empty_and_invalid() { + let (n, e) = parse_filters(None, &feature_names(), &enum_values()); + assert!(n.is_empty() && e.is_empty()); + + let (n, e) = parse_filters(Some(""), &feature_names(), &enum_values()); + assert!(n.is_empty() && e.is_empty()); + + let (n, e) = parse_filters(Some("unknown:1:2"), &feature_names(), &enum_values()); + assert!(n.is_empty() && e.is_empty()); + } + + #[test] + fn row_passes_numeric_filter() { + let filters = vec![ParsedFilter { feat_idx: 0, min: 10.0, max: 20.0 }]; + let data = vec![15.0, 5.0, f32::NAN]; + + assert!(row_passes_filters(0, &filters, &[], &data, 1)); + assert!(!row_passes_filters(1, &filters, &[], &data, 1)); + assert!(!row_passes_filters(2, &filters, &[], &data, 1)); // NaN fails + } + + #[test] + fn row_passes_enum_filter() { + let filters = vec![ParsedEnumFilter { feat_idx: 0, allowed: vec![0.0, 2.0] }]; + // Row 0: value 0.0 (allowed), Row 1: value 1.0 (not allowed), Row 2: value 2.0 (allowed), Row 3: NaN (fails) + let data = vec![0.0, 1.0, 2.0, f32::NAN]; + + assert!(row_passes_filters(0, &[], &filters, &data, 1)); + assert!(!row_passes_filters(1, &[], &filters, &data, 1)); + assert!(row_passes_filters(2, &[], &filters, &data, 1)); + assert!(!row_passes_filters(3, &[], &filters, &data, 1)); // NaN fails + } +} diff --git a/server-rs/src/routes/mod.rs b/server-rs/src/routes.rs similarity index 79% rename from server-rs/src/routes/mod.rs rename to server-rs/src/routes.rs index 10c293a..3ee7d9f 100644 --- a/server-rs/src/routes/mod.rs +++ b/server-rs/src/routes.rs @@ -2,11 +2,10 @@ mod features; mod hexagon_stats; pub(crate) mod hexagons; mod og_image; -pub(crate) mod parse; mod pois; pub(crate) mod properties; -pub use features::get_features; +pub use features::{build_features_response, get_features, FeaturesResponse}; pub use hexagon_stats::get_hexagon_stats; pub use hexagons::get_hexagons; pub use og_image::get_og_image; diff --git a/server-rs/src/routes/features.rs b/server-rs/src/routes/features.rs index a47f86f..b434b7d 100644 --- a/server-rs/src/routes/features.rs +++ b/server-rs/src/routes/features.rs @@ -4,11 +4,11 @@ use axum::response::Json; use serde::Serialize; use tracing::info; -use crate::data::Histogram; +use crate::data::{Histogram, PropertyData}; use crate::features::{ENUM_FEATURE_GROUPS, FEATURE_GROUPS}; use crate::state::AppState; -#[derive(Serialize)] +#[derive(Clone, Serialize)] #[serde(tag = "type")] pub enum FeatureInfo { #[serde(rename = "numeric")] @@ -32,18 +32,19 @@ pub enum FeatureInfo { }, } -#[derive(Serialize)] +#[derive(Clone, Serialize)] pub struct FeatureGroupResponse { name: String, features: Vec, } -#[derive(Serialize)] +#[derive(Clone, Serialize)] pub struct FeaturesResponse { - groups: Vec, + pub groups: Vec, } -pub async fn get_features(state: Arc) -> Json { +/// Build the features response at startup. Called once and cached in AppState. +pub fn build_features_response(data: &PropertyData) -> FeaturesResponse { // Collect all group names in order, merging numeric and enum groups with the same name let mut group_names: Vec<&str> = Vec::new(); for feature_group in FEATURE_GROUPS { @@ -66,13 +67,12 @@ pub async fn get_features(state: Arc) -> Json { for feature_group in FEATURE_GROUPS { if feature_group.name == group_name { for feature_config in feature_group.features { - if let Some(feat_idx) = state - .data + if let Some(feat_idx) = data .feature_names .iter() .position(|feat_name| feat_name == feature_config.name) { - let stats = &state.data.feature_stats[feat_idx]; + let stats = &data.feature_stats[feat_idx]; features.push(FeatureInfo::Numeric { name: feature_config.name.to_string(), min: stats.slider_min, @@ -92,19 +92,22 @@ pub async fn get_features(state: Arc) -> Json { for enum_group in ENUM_FEATURE_GROUPS { if enum_group.name == group_name { for enum_config in enum_group.features { - if let Some(enum_feature) = state - .data - .enum_features + // Find the feature index by name + if let Some(feat_idx) = data + .feature_names .iter() - .find(|enum_feat| enum_feat.name == enum_config.name) + .position(|name| name == enum_config.name) { - features.push(FeatureInfo::Enum { - name: enum_config.name.to_string(), - values: enum_feature.values.clone(), - description: enum_config.description, - detail: enum_config.detail, - source: enum_config.source, - }); + // Check if this feature has enum values + if let Some(values) = data.enum_values.get(&feat_idx) { + features.push(FeatureInfo::Enum { + name: enum_config.name.to_string(), + values: values.clone(), + description: enum_config.description, + detail: enum_config.detail, + source: enum_config.source, + }); + } } } } @@ -118,22 +121,10 @@ pub async fn get_features(state: Arc) -> Json { } } - let num_numeric: usize = groups - .iter() - .flat_map(|group| &group.features) - .filter(|feature| matches!(feature, FeatureInfo::Numeric { .. })) - .count(); - let num_enum: usize = groups - .iter() - .flat_map(|group| &group.features) - .filter(|feature| matches!(feature, FeatureInfo::Enum { .. })) - .count(); - - info!( - numeric = num_numeric, - enums = num_enum, - groups = groups.len(), - "GET /api/features" - ); - Json(FeaturesResponse { groups }) + FeaturesResponse { groups } +} + +pub async fn get_features(state: Arc) -> Json { + info!("GET /api/features"); + Json(state.features_response.clone()) } diff --git a/server-rs/src/routes/hexagon_stats.rs b/server-rs/src/routes/hexagon_stats.rs index bdd65b7..937c63c 100644 --- a/server-rs/src/routes/hexagon_stats.rs +++ b/server-rs/src/routes/hexagon_stats.rs @@ -1,18 +1,50 @@ -use std::fmt::Write; +use std::collections::HashMap; use std::str::FromStr; use std::sync::Arc; use axum::extract::Query; use axum::http::StatusCode; -use axum::response::IntoResponse; -use serde::Deserialize; +use axum::response::Json; +use serde::{Deserialize, Serialize}; use tracing::{info, warn}; -use crate::consts::{ENUM_NULL, H3_PRECOMPUTE_MAX, H3_REQUEST_MAX, H3_REQUEST_MIN, HISTOGRAM_BINS}; -use crate::filter::{parse_filters, row_passes_filters}; +use crate::consts::{H3_PRECOMPUTE_MAX, H3_REQUEST_MAX, H3_REQUEST_MIN, HISTOGRAM_BINS}; +use crate::parsing::{h3_cell_bounds, parse_filters, row_passes_filters}; use crate::state::AppState; -use super::parse::h3_cell_bounds; +#[derive(Serialize)] +pub struct HistogramStats { + min: f64, + max: f64, + /// 1st percentile (left edge of main distribution) + p1: f64, + /// 99th percentile (right edge of main distribution) + p99: f64, + counts: Vec, +} + +#[derive(Serialize)] +pub struct NumericFeatureStats { + name: String, + count: usize, + min: f64, + max: f64, + mean: f64, + histogram: HistogramStats, +} + +#[derive(Serialize)] +pub struct EnumFeatureStats { + name: String, + counts: HashMap, +} + +#[derive(Serialize)] +pub struct HexagonStatsResponse { + count: usize, + numeric_features: Vec, + enum_features: Vec, +} #[derive(Deserialize)] pub struct HexagonStatsParams { @@ -20,15 +52,14 @@ pub struct HexagonStatsParams { pub resolution: u8, pub filters: Option, /// Comma-separated feature names to include in stats response. - /// When present (even if empty), only listed features are computed. - /// When absent, all features are returned (backward compatible). + /// Only listed features are computed; if absent or empty, no features are returned. pub fields: Option, } pub async fn get_hexagon_stats( state: Arc, Query(params): Query, -) -> Result { +) -> Result, (StatusCode, String)> { let cell = h3o::CellIndex::from_str(¶ms.h3).map_err(|error| { warn!(h3 = %params.h3, error = %error, "Invalid H3 cell index"); ( @@ -57,36 +88,34 @@ pub async fn get_hexagon_stats( let (parsed_filters, parsed_enum_filters) = parse_filters( params.filters.as_deref(), &state.data.feature_names, - &state.data.enum_features, + &state.data.enum_values, ); let num_filters = parsed_filters.len() + parsed_enum_filters.len(); - // Parse optional `fields` param into sets of feature names. - // None = include all, Some = only include listed features. - let field_set: Option> = - params.fields.as_ref().map(|fields_str| { + let fields_specified = params.fields.is_some(); + let field_set: std::collections::HashSet = params + .fields + .as_ref() + .map(|fields_str| { fields_str .split(',') .map(|field| field.trim().to_string()) .filter(|field| !field.is_empty()) .collect() - }); + }) + .unwrap_or_default(); - let result = tokio::task::spawn_blocking(move || { + let response = tokio::task::spawn_blocking(move || { let start_time = std::time::Instant::now(); let precomputed = &state.h3_cells; let h3_res = h3o::Resolution::try_from(resolution) .map_err(|err| format!("Invalid H3 resolution {}: {}", resolution, err))?; let need_parent = resolution < H3_PRECOMPUTE_MAX; let num_features = state.data.num_features; - let num_enums = state.data.num_enums; let feature_data = &state.data.feature_data; - let enum_data = &state.data.enum_data; - let enum_features = &state.data.enum_features; let (min_lat, min_lon, max_lat, max_lon) = h3_cell_bounds(cell, 0.001); - // Resolve cell at requested resolution from precomputed max-resolution cell let cell_for_row = |row: usize| -> u64 { let max_cell = precomputed[row]; if !need_parent || max_cell == 0 { @@ -99,7 +128,6 @@ pub async fn get_hexagon_stats( .unwrap_or(0) }; - // Collect matching rows let mut matching_rows: Vec = Vec::new(); state .grid @@ -112,8 +140,6 @@ pub async fn get_hexagon_stats( &parsed_enum_filters, feature_data, num_features, - enum_data, - num_enums, ) { matching_rows.push(row); @@ -122,140 +148,108 @@ pub async fn get_hexagon_stats( let total_count = matching_rows.len(); - // Build JSON directly via string buffer - let mut output = String::with_capacity(4096); - output.push_str("{\"count\":"); - write!(output, "{}", total_count).unwrap(); + let mut numeric_features = Vec::new(); + let mut enum_features_out = Vec::new(); - // Numeric features: compute count, min, max, sum, histogram using global bin edges - output.push_str(",\"numeric_features\":["); - let mut first_numeric = true; for (feature_index, feature_name) in state.data.feature_names.iter().enumerate() { - // Skip features not in the requested set (when fields param is present) - if let Some(ref set) = field_set { - if !set.contains(feature_name.as_str()) { - continue; - } - } - let global_stats = &state.data.feature_stats[feature_index]; - let histogram_min = global_stats.histogram.min; - let histogram_max = global_stats.histogram.max; - let bin_width = global_stats.histogram.bin_width; - - let mut count = 0usize; - let mut min_value = f32::INFINITY; - let mut max_value = f32::NEG_INFINITY; - let mut sum = 0.0f64; // keep f64 for mean precision - let mut bins = vec![0u64; HISTOGRAM_BINS]; - - for &row in &matching_rows { - let value = feature_data[row * num_features + feature_index]; - if value.is_finite() { - count += 1; - if value < min_value { - min_value = value; - } - if value > max_value { - max_value = value; - } - sum += value as f64; - - // Bin into histogram using global edges (cast to f64 for bin index math) - if bin_width > 0.0 { - let bin_index = ((value as f64 - histogram_min as f64) / bin_width as f64) - .floor() as isize; - let clamped_index = - bin_index.max(0).min((HISTOGRAM_BINS - 1) as isize) as usize; - bins[clamped_index] += 1; - } - } - } - - if count == 0 { + if fields_specified && !field_set.contains(feature_name.as_str()) { continue; } - if !first_numeric { - output.push(','); - } - first_numeric = false; - - let mean = sum / count as f64; - output.push_str("{\"name\":"); - write_json_string(&mut output, feature_name); - write!(output, ",\"count\":{}", count).unwrap(); - write!(output, ",\"min\":{}", format_num(min_value)).unwrap(); - write!(output, ",\"max\":{}", format_num(max_value)).unwrap(); - write!(output, ",\"mean\":{}", format_f64(mean)).unwrap(); - output.push_str(",\"histogram\":{\"min\":"); - write!(output, "{}", format_num(histogram_min)).unwrap(); - output.push_str(",\"max\":"); - write!(output, "{}", format_num(histogram_max)).unwrap(); - output.push_str(",\"bin_width\":"); - write!(output, "{}", format_num(bin_width)).unwrap(); - output.push_str(",\"counts\":["); - for (bin_index, &bin_count) in bins.iter().enumerate() { - if bin_index > 0 { - output.push(','); + // Check if this is an enum feature + if let Some(enum_values) = state.data.enum_values.get(&feature_index) { + // Enum feature: count occurrences of each value + let mut value_counts = vec![0u64; enum_values.len()]; + for &row in &matching_rows { + let value = feature_data[row * num_features + feature_index]; + if value.is_finite() { + let idx = value as usize; + if idx < value_counts.len() { + value_counts[idx] += 1; + } + } + } + + let counts: HashMap = value_counts + .iter() + .enumerate() + .filter(|(_, &count)| count > 0) + .map(|(idx, &count)| (enum_values[idx].clone(), count)) + .collect(); + + if !counts.is_empty() { + enum_features_out.push(EnumFeatureStats { + name: feature_name.clone(), + counts, + }); + } + } else { + // Numeric feature: compute stats and histogram + let global_hist = &state.data.feature_stats[feature_index].histogram; + let p1 = global_hist.p1; + let p99 = global_hist.p99; + + let mut count = 0usize; + let mut min_value = f32::INFINITY; + let mut max_value = f32::NEG_INFINITY; + let mut sum = 0.0f64; + let mut bins = vec![0u64; HISTOGRAM_BINS]; + + // Compute middle bin width (between p1 and p99) + let middle_bins = HISTOGRAM_BINS.saturating_sub(2); + let middle_width = if middle_bins > 0 && p99 > p1 { + (p99 - p1) / middle_bins as f32 + } else { + 0.0 + }; + + for &row in &matching_rows { + let value = feature_data[row * num_features + feature_index]; + if value.is_finite() { + count += 1; + if value < min_value { + min_value = value; + } + if value > max_value { + max_value = value; + } + sum += value as f64; + + // Bin using p1/p99 outlier structure + let bin = if value < p1 { + 0 // Low outlier bin + } else if value >= p99 { + HISTOGRAM_BINS - 1 // High outlier bin + } else if middle_width > 0.0 { + // Middle bins (1 to n-2) + let middle_bin = ((value - p1) / middle_width) as usize; + (1 + middle_bin).min(HISTOGRAM_BINS - 2) + } else { + HISTOGRAM_BINS / 2 // Fallback if p1 == p99 + }; + bins[bin] += 1; + } + } + + if count > 0 { + numeric_features.push(NumericFeatureStats { + name: feature_name.clone(), + count, + min: min_value as f64, + max: max_value as f64, + mean: sum / count as f64, + histogram: HistogramStats { + min: global_hist.min as f64, + max: global_hist.max as f64, + p1: p1 as f64, + p99: p99 as f64, + counts: bins, + }, + }); } - write!(output, "{}", bin_count).unwrap(); } - output.push_str("]}}") } - // Enum features: count per value - output.push_str("],\"enum_features\":["); - let mut first_enum = true; - for enum_feature in enum_features { - // Skip enum features not in the requested set - if let Some(ref set) = field_set { - if !set.contains(enum_feature.name.as_str()) { - continue; - } - } - let enum_index = match state.enum_name_to_idx.get(&enum_feature.name) { - Some(&index) => index, - None => continue, - }; - - let mut value_counts = vec![0u64; enum_feature.values.len()]; - for &row in &matching_rows { - let value = enum_data[row * num_enums + enum_index]; - if value != ENUM_NULL && (value as usize) < value_counts.len() { - value_counts[value as usize] += 1; - } - } - - // Only include if there are any non-zero counts - let has_values = value_counts.iter().any(|&count| count > 0); - if !has_values { - continue; - } - - if !first_enum { - output.push(','); - } - first_enum = false; - - output.push_str("{\"name\":"); - write_json_string(&mut output, &enum_feature.name); - output.push_str(",\"counts\":{"); - let mut first_value = true; - for (value_index, &count) in value_counts.iter().enumerate() { - if count == 0 { - continue; - } - if !first_value { - output.push(','); - } - first_value = false; - write_json_string(&mut output, &enum_feature.values[value_index]); - write!(output, ":{}", count).unwrap(); - } - output.push_str("}}"); - } - output.push_str("]}"); - let elapsed = start_time.elapsed(); info!( h3 = %h3_str, @@ -267,46 +261,15 @@ pub async fn get_hexagon_stats( "GET /api/hexagon-stats" ); - Ok(output) + Ok(HexagonStatsResponse { + count: total_count, + numeric_features, + enum_features: enum_features_out, + }) }) .await .map_err(|error| (StatusCode::INTERNAL_SERVER_ERROR, error.to_string()))? - .map_err(|error| (StatusCode::INTERNAL_SERVER_ERROR, error))?; + .map_err(|error: String| (StatusCode::INTERNAL_SERVER_ERROR, error))?; - Ok(( - [(axum::http::header::CONTENT_TYPE, "application/json")], - result, - )) -} - -fn write_json_string(output: &mut String, value: &str) { - output.push('"'); - for character in value.chars() { - match character { - '"' => output.push_str("\\\""), - '\\' => output.push_str("\\\\"), - '\n' => output.push_str("\\n"), - '\r' => output.push_str("\\r"), - '\t' => output.push_str("\\t"), - other => output.push(other), - } - } - output.push('"'); -} - -fn format_num(value: f32) -> String { - let fv = value as f64; - if fv.fract() == 0.0 && fv.abs() < 1e15 { - format!("{:.1}", fv) - } else { - format!("{}", fv) - } -} - -fn format_f64(value: f64) -> String { - if value.fract() == 0.0 && value.abs() < 1e15 { - format!("{:.1}", value) - } else { - format!("{}", value) - } + Ok(Json(response)) } diff --git a/server-rs/src/routes/hexagons.rs b/server-rs/src/routes/hexagons.rs index 4fd8149..f73141c 100644 --- a/server-rs/src/routes/hexagons.rs +++ b/server-rs/src/routes/hexagons.rs @@ -1,35 +1,22 @@ -use std::fmt::{self, Write}; use std::sync::Arc; use axum::extract::Query; use axum::http::StatusCode; -use axum::response::IntoResponse; +use axum::response::Json; use rustc_hash::FxHashMap; -use serde::Deserialize; +use serde::{Deserialize, Serialize}; +use serde_json::{Map, Value}; use tracing::{info, warn}; use crate::consts::{ - BOUNDS_BUFFER_PERCENT, BOUNDS_QUANTIZATION, ENUM_NULL, H3_PRECOMPUTE_MAX, H3_REQUEST_MAX, - H3_REQUEST_MIN, POSTCODE_MIN_RESOLUTION, + BOUNDS_BUFFER_PERCENT, BOUNDS_QUANTIZATION, H3_PRECOMPUTE_MAX, H3_REQUEST_MAX, H3_REQUEST_MIN, }; -use crate::filter::parse_filters; +use crate::parsing::{parse_bounds, parse_filters, row_passes_filters}; use crate::state::AppState; -use super::parse::parse_bounds; - -struct HumanBytes(usize); - -impl fmt::Display for HumanBytes { - fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { - let bytes = self.0; - if bytes >= 1_000_000 { - write!(formatter, "{:.1} MB", bytes as f64 / 1_000_000.0) - } else if bytes >= 1_000 { - write!(formatter, "{:.1} KB", bytes as f64 / 1_000.0) - } else { - write!(formatter, "{} B", bytes) - } - } +#[derive(Serialize)] +pub struct HexagonsResponse { + features: Vec>, } #[derive(Deserialize)] @@ -51,28 +38,14 @@ struct CellAgg { count: u32, mins: Box<[f32]>, maxs: Box<[f32]>, - /// Min/max ordinal indices for enum features (255 = no data yet) - enum_mins: Box<[u8]>, - enum_maxs: Box<[u8]>, - /// Most common postcode in this cell (only tracked at high resolutions) - postcode: Option, - postcode_count: u32, - lat_sum: f64, - lon_sum: f64, } impl CellAgg { - fn new(num_features: usize, num_enums: usize) -> Self { + fn new(num_features: usize) -> Self { CellAgg { count: 0, mins: vec![f32::INFINITY; num_features].into_boxed_slice(), maxs: vec![f32::NEG_INFINITY; num_features].into_boxed_slice(), - enum_mins: vec![ENUM_NULL; num_enums].into_boxed_slice(), - enum_maxs: vec![0; num_enums].into_boxed_slice(), - postcode: None, - postcode_count: 0, - lat_sum: 0.0, - lon_sum: 0.0, } } @@ -96,23 +69,6 @@ impl CellAgg { } } - /// Track min/max ordinal index for each enum feature in this cell. - #[inline] - fn add_enums(&mut self, enum_data: &[u8], row: usize, num_enums: usize) { - let base = row * num_enums; - let row_slice = &enum_data[base..base + num_enums]; - for (enum_index, &value) in row_slice.iter().enumerate() { - if value != ENUM_NULL { - if self.enum_mins[enum_index] == ENUM_NULL || value < self.enum_mins[enum_index] { - self.enum_mins[enum_index] = value; - } - if value > self.enum_maxs[enum_index] { - self.enum_maxs[enum_index] = value; - } - } - } - } - /// Add a row, only aggregating the features at the given indices. #[inline] fn add_row_selective( @@ -136,178 +92,57 @@ impl CellAgg { } } } - - /// Track min/max ordinal index for selected enum features only. - #[inline] - fn add_enums_selective( - &mut self, - enum_data: &[u8], - row: usize, - num_enums: usize, - indices: &[usize], - ) { - let base = row * num_enums; - for &enum_index in indices { - let value = enum_data[base + enum_index]; - if value != ENUM_NULL { - if self.enum_mins[enum_index] == ENUM_NULL || value < self.enum_mins[enum_index] { - self.enum_mins[enum_index] = value; - } - if value > self.enum_maxs[enum_index] { - self.enum_maxs[enum_index] = value; - } - } - } - } - - /// Track postcode and centroid for high-resolution cells. - /// Uses simple "first seen" approach — at res 11/12, most rows in a cell share a postcode. - #[inline] - fn add_postcode(&mut self, postcode: &str, lat: f32, lon: f32) { - self.lat_sum += lat as f64; - self.lon_sum += lon as f64; - if postcode.is_empty() { - return; - } - if self.postcode.is_none() { - self.postcode = Some(postcode.to_string()); - self.postcode_count = 1; - } else if self.postcode.as_deref() == Some(postcode) { - self.postcode_count += 1; - } - } } -/// Escape a string for inclusion in a JSON string literal. -pub(crate) fn write_json_escaped(buf: &mut String, text: &str) { - for character in text.chars() { - match character { - '"' => buf.push_str("\\\""), - '\\' => buf.push_str("\\\\"), - '\n' => buf.push_str("\\n"), - '\r' => buf.push_str("\\r"), - '\t' => buf.push_str("\\t"), - ctrl if ctrl < '\x20' => { - let _ = write!(buf, "\\u{:04x}", ctrl as u32); - } - other => buf.push(other), - } - } -} - -/// Write the hexagons JSON response directly to a String buffer, -/// avoiding serde_json::Value allocations entirely. -/// When `numeric_indices` / `enum_indices` are Some, only those features are written. -#[allow(clippy::too_many_arguments)] -fn write_hexagons_json( - buf: &mut String, +/// Build feature maps from aggregated cell data. +fn build_feature_maps( groups: &FxHashMap, min_keys: &[String], max_keys: &[String], num_features: usize, - enum_min_keys: &[String], - enum_max_keys: &[String], - num_enums: usize, - include_postcode: bool, - numeric_indices: Option<&[usize]>, - enum_indices: Option<&[usize]>, -) { - buf.push_str("{\"features\":["); - let mut first = true; + indices: Option<&[usize]>, +) -> Vec> { + let mut features = Vec::with_capacity(groups.len()); + for (&cell_id, aggregation) in groups { let Some(cell) = h3o::CellIndex::try_from(cell_id).ok() else { continue; }; - if !first { - buf.push(','); - } - first = false; + let mut map = Map::new(); + map.insert("h3".into(), Value::String(cell.to_string())); + map.insert("count".into(), Value::Number(aggregation.count.into())); - let _ = write!(buf, "{{\"h3\":\"{}\",\"count\":{}", cell, aggregation.count); - - if let Some(indices) = numeric_indices { - for &feat_index in indices { - if aggregation.mins[feat_index].is_finite() - && aggregation.maxs[feat_index].is_finite() - { - let _ = write!( - buf, - ",\"{}\":{},\"{}\":{}", - min_keys[feat_index], - aggregation.mins[feat_index], - max_keys[feat_index], - aggregation.maxs[feat_index] - ); - } - } + let iter: Box> = if let Some(idx) = indices { + Box::new(idx.iter().copied()) } else { - for feat_index in 0..num_features { - if aggregation.mins[feat_index].is_finite() - && aggregation.maxs[feat_index].is_finite() - { - let _ = write!( - buf, - ",\"{}\":{},\"{}\":{}", - min_keys[feat_index], - aggregation.mins[feat_index], - max_keys[feat_index], - aggregation.maxs[feat_index] - ); + Box::new(0..num_features) + }; + + for feat_index in iter { + if aggregation.mins[feat_index].is_finite() + && aggregation.maxs[feat_index].is_finite() + { + if let (Some(min_num), Some(max_num)) = ( + serde_json::Number::from_f64(aggregation.mins[feat_index] as f64), + serde_json::Number::from_f64(aggregation.maxs[feat_index] as f64), + ) { + map.insert(min_keys[feat_index].clone(), Value::Number(min_num)); + map.insert(max_keys[feat_index].clone(), Value::Number(max_num)); } } } - if let Some(indices) = enum_indices { - for &enum_index in indices { - if aggregation.enum_mins[enum_index] != ENUM_NULL { - let _ = write!( - buf, - ",\"{}\":{},\"{}\":{}", - enum_min_keys[enum_index], - aggregation.enum_mins[enum_index], - enum_max_keys[enum_index], - aggregation.enum_maxs[enum_index] - ); - } - } - } else { - for enum_index in 0..num_enums { - if aggregation.enum_mins[enum_index] != ENUM_NULL { - let _ = write!( - buf, - ",\"{}\":{},\"{}\":{}", - enum_min_keys[enum_index], - aggregation.enum_mins[enum_index], - enum_max_keys[enum_index], - aggregation.enum_maxs[enum_index] - ); - } - } - } - - if include_postcode { - if let Some(ref postcode) = aggregation.postcode { - let total = aggregation.count as f64; - let centroid_lat = aggregation.lat_sum / total; - let centroid_lon = aggregation.lon_sum / total; - if centroid_lat.is_finite() && centroid_lon.is_finite() { - buf.push_str(",\"postcode\":\""); - write_json_escaped(buf, postcode); - let _ = write!(buf, "\",\"lat\":{},\"lon\":{}", centroid_lat, centroid_lon); - } - } - } - - buf.push('}'); + features.push(map); } - buf.push_str("]}"); + + features } pub async fn get_hexagons( state: Arc, Query(params): Query, -) -> Result { +) -> Result, (StatusCode, String)> { let resolution = params.resolution; if !(H3_REQUEST_MIN..=H3_REQUEST_MAX).contains(&resolution) { warn!( @@ -346,50 +181,40 @@ pub async fn get_hexagons( let (parsed_filters, parsed_enum_filters) = parse_filters( params.filters.as_deref(), &state.data.feature_names, - &state.data.enum_features, + &state.data.enum_values, ); let num_filters = parsed_filters.len() + parsed_enum_filters.len(); - // Parse optional `fields` param into numeric and enum index sets. + // Parse optional `fields` param into feature indices. // If `fields` is absent (None), all features are included. // If `fields` is present (even empty string), only listed features are included. - let field_indices: Option<(Vec, Vec)> = - params.fields.as_ref().map(|fields_str| { - let mut numeric_indices = Vec::new(); - let mut enum_indices = Vec::new(); - if !fields_str.is_empty() { - for name in fields_str.split(',') { - let name = name.trim(); - if name.is_empty() { - continue; - } - if let Some(idx) = state - .data - .feature_names - .iter() - .position(|feat| feat == name) - { - numeric_indices.push(idx); - } else if let Some(&idx) = state.enum_name_to_idx.get(name) { - enum_indices.push(idx); - } + let field_indices: Option> = params.fields.as_ref().map(|fields_str| { + if fields_str.is_empty() { + return Vec::new(); + } + fields_str + .split(',') + .filter_map(|name| { + let name = name.trim(); + if name.is_empty() { + return None; } - } - (numeric_indices, enum_indices) - }); + state + .data + .feature_names + .iter() + .position(|feat| feat == name) + }) + .collect() + }); - let json_body = tokio::task::spawn_blocking(move || -> Result { + let response = tokio::task::spawn_blocking(move || -> Result { let t0 = std::time::Instant::now(); let num_features = state.data.num_features; - let num_enums = state.data.num_enums; let feature_data = &state.data.feature_data; - let enum_data = &state.data.enum_data; - let min_keys = &state.min_keys; let max_keys = &state.max_keys; - let enum_min_keys = &state.enum_min_keys; - let enum_max_keys = &state.enum_max_keys; let h3_res = h3o::Resolution::try_from(resolution) .map_err(|error| format!("Invalid H3 resolution {}: {}", resolution, error))?; @@ -398,50 +223,20 @@ pub async fn get_hexagons( let mut groups: FxHashMap = FxHashMap::default(); - let include_postcode = resolution >= POSTCODE_MIN_RESOLUTION; - - // Row-level filter check: numeric must be non-NaN and within [min, max], - // enum must have value index in the allowed set - let row_passes = |row: usize| -> bool { - parsed_filters.iter().all(|filter| { - let value = feature_data[row * num_features + filter.feat_idx]; - value.is_finite() && value >= filter.min && value <= filter.max - }) && parsed_enum_filters.iter().all(|enum_filter| { - let value = enum_data[row * num_enums + enum_filter.enum_idx]; - value != ENUM_NULL && enum_filter.allowed.contains(&value) - }) - }; - - // Choose aggregation strategy based on whether fields are specified let has_selective = field_indices.is_some(); - let (sel_numeric, sel_enum) = field_indices - .as_ref() - .map_or((&[][..], &[][..]), |(ni, ei)| { - (ni.as_slice(), ei.as_slice()) - }); + let sel_indices = field_indices.as_deref().unwrap_or(&[]); let aggregate_row = |groups: &mut FxHashMap, cell_id: u64, row: usize| { let aggregation = groups .entry(cell_id) - .or_insert_with(|| CellAgg::new(num_features, num_enums)); + .or_insert_with(|| CellAgg::new(num_features)); if has_selective { - aggregation.add_row_selective(feature_data, row, num_features, sel_numeric); - aggregation.add_enums_selective(enum_data, row, num_enums, sel_enum); + aggregation.add_row_selective(feature_data, row, num_features, sel_indices); } else { aggregation.add_row(feature_data, row, num_features); - aggregation.add_enums(enum_data, row, num_enums); - } - if include_postcode { - aggregation.add_postcode( - state.data.postcode(row), - state.data.lat[row], - state.data.lon[row], - ); } }; - // Resolve cell at requested resolution from precomputed max-resolution cell. - // For max resolution, use directly; for lower resolutions, derive parent. let cell_for_row = |row: usize| -> u64 { let max_cell = precomputed[row]; if !need_parent || max_cell == 0 { @@ -458,7 +253,13 @@ pub async fn get_hexagons( .grid .for_each_in_bounds(south, west, north, east, |row_idx| { let row = row_idx as usize; - if !row_passes(row) { + if !row_passes_filters( + row, + &parsed_filters, + &parsed_enum_filters, + feature_data, + num_features, + ) { return; } aggregate_row(&mut groups, cell_for_row(row), row); @@ -466,19 +267,12 @@ pub async fn get_hexagons( let t_agg = t0.elapsed(); - let mut json_buf = String::with_capacity(groups.len() * 128); - write_hexagons_json( - &mut json_buf, + let features = build_feature_maps( &groups, min_keys, max_keys, num_features, - enum_min_keys, - enum_max_keys, - num_enums, - include_postcode, - field_indices.as_ref().map(|(ni, _)| ni.as_slice()), - field_indices.as_ref().map(|(_, ei)| ei.as_slice()), + field_indices.as_deref(), ); let t_total = t0.elapsed(); @@ -489,15 +283,14 @@ pub async fn get_hexagons( filters_raw = filters_str.as_deref().unwrap_or("-"), agg_ms = format_args!("{:.1}", t_agg.as_secs_f64() * 1000.0), total_ms = format_args!("{:.1}", t_total.as_secs_f64() * 1000.0), - size = format_args!("{}", HumanBytes(json_buf.len())), "GET /api/hexagons" ); - Ok(json_buf) + Ok(HexagonsResponse { features }) }) .await .map_err(|error| (StatusCode::INTERNAL_SERVER_ERROR, error.to_string()))? .map_err(|error| (StatusCode::INTERNAL_SERVER_ERROR, error))?; - Ok(([("content-type", "application/json")], json_body)) + Ok(Json(response)) } diff --git a/server-rs/src/routes/og_image.rs b/server-rs/src/routes/og_image.rs index 0c2bb70..7dd0c8e 100644 --- a/server-rs/src/routes/og_image.rs +++ b/server-rs/src/routes/og_image.rs @@ -3,6 +3,7 @@ use std::sync::Arc; use axum::extract::Query; use axum::http::{header, StatusCode}; use axum::response::IntoResponse; +use tracing::{info, warn}; use crate::state::AppState; @@ -48,7 +49,7 @@ pub async fn get_og_image( }; let url = format!("{}/screenshot{}", sidecar_url, qs); - tracing::info!("Proxying OG screenshot request to: {}", url); + info!("Proxying OG screenshot request to: {}", url); match state.http_client.get(&url).send().await { Ok(resp) if resp.status().is_success() => match resp.bytes().await { @@ -62,18 +63,18 @@ pub async fn get_og_image( ) .into_response(), Err(err) => { - tracing::warn!("Failed to read sidecar response: {}", err); + warn!("Failed to read sidecar response: {}", err); (StatusCode::BAD_GATEWAY, "Failed to read screenshot").into_response() } }, Ok(resp) => { let status = resp.status(); let body = resp.text().await.unwrap_or_default(); - tracing::warn!("Sidecar returned status {}: {}", status, body); + warn!("Sidecar returned status {}: {}", status, body); (StatusCode::BAD_GATEWAY, "Screenshot sidecar error").into_response() } Err(err) => { - tracing::warn!("Failed to reach sidecar: {}", err); + warn!("Failed to reach sidecar: {}", err); (StatusCode::BAD_GATEWAY, "Screenshot sidecar unavailable").into_response() } } diff --git a/server-rs/src/routes/pois.rs b/server-rs/src/routes/pois.rs index 7ba89f7..06f9e48 100644 --- a/server-rs/src/routes/pois.rs +++ b/server-rs/src/routes/pois.rs @@ -2,15 +2,31 @@ use std::sync::Arc; use axum::extract::Query; use axum::http::StatusCode; -use axum::response::{IntoResponse, Json}; +use axum::response::Json; use serde::{Deserialize, Serialize}; use tracing::info; use crate::consts::MAX_POIS_PER_REQUEST; -use crate::state::{AppState, POICategoryGroup}; +use crate::data::POICategoryGroup; +use crate::parsing::parse_bounds; +use crate::state::AppState; -use super::hexagons::write_json_escaped; -use super::parse::parse_bounds; +#[derive(Serialize)] +#[allow(clippy::upper_case_acronyms)] +pub struct POI { + id: String, + name: String, + category: String, + group: String, + lat: f32, + lng: f32, + emoji: String, +} + +#[derive(Serialize)] +pub struct POIsResponse { + pois: Vec, +} #[derive(Deserialize)] pub struct POIParams { @@ -22,7 +38,7 @@ pub struct POIParams { pub async fn get_pois( state: Arc, Query(params): Query, -) -> Result { +) -> Result, (StatusCode, String)> { let bounds_str = params.bounds.ok_or(( StatusCode::BAD_REQUEST, "bounds parameter is required".into(), @@ -43,12 +59,10 @@ pub async fn get_pois( let num_categories = category_filter.as_ref().map(|cats| cats.len()).unwrap_or(0); - let json_body = tokio::task::spawn_blocking(move || { + let pois = tokio::task::spawn_blocking(move || { let t0 = std::time::Instant::now(); let row_indices = state.poi_grid.query(south, west, north, east); - // Collect matching row indices first, then sample randomly so the - // subset covers the viewport uniformly instead of clustering in one area. let mut matching_rows: Vec = row_indices .iter() .filter_map(|&row_idx| { @@ -73,36 +87,22 @@ pub async fn get_pois( } } - // Write JSON directly to string buffer, avoiding intermediate POI allocations - let mut buf = String::with_capacity(matching_rows.len() * 128); - buf.push_str("{\"pois\":["); - - for (i, &row) in matching_rows.iter().enumerate() { - if i > 0 { - buf.push(','); - } - buf.push_str("{\"id\":\""); - write_json_escaped(&mut buf, &state.poi_data.id[row]); - buf.push_str("\",\"name\":\""); - write_json_escaped(&mut buf, &state.poi_data.name[row]); - buf.push_str("\",\"category\":\""); - write_json_escaped(&mut buf, state.poi_data.category.get(row)); - buf.push_str("\",\"group\":\""); - write_json_escaped(&mut buf, state.poi_data.group.get(row)); - buf.push_str("\",\"lat\":"); - buf.push_str(&state.poi_data.lat[row].to_string()); - buf.push_str(",\"lng\":"); - buf.push_str(&state.poi_data.lng[row].to_string()); - buf.push_str(",\"emoji\":\""); - write_json_escaped(&mut buf, state.poi_data.emoji.get(row)); - buf.push_str("\"}"); - } - - buf.push_str("]}"); + let pois: Vec = matching_rows + .iter() + .map(|&row| POI { + id: state.poi_data.id[row].clone(), + name: state.poi_data.name[row].clone(), + category: state.poi_data.category.get(row).to_string(), + group: state.poi_data.group.get(row).to_string(), + lat: state.poi_data.lat[row], + lng: state.poi_data.lng[row], + emoji: state.poi_data.emoji.get(row).to_string(), + }) + .collect(); let elapsed = t0.elapsed(); info!( - results = matching_rows.len(), + results = pois.len(), candidates = row_indices.len(), categories = num_categories, categories_raw = categories_str.as_deref().unwrap_or("-"), @@ -110,12 +110,12 @@ pub async fn get_pois( "GET /api/pois" ); - buf + pois }) .await .map_err(|error| (StatusCode::INTERNAL_SERVER_ERROR, error.to_string()))?; - Ok(([("content-type", "application/json")], json_body)) + Ok(Json(POIsResponse { pois })) } #[derive(Serialize)] diff --git a/server-rs/src/routes/properties.rs b/server-rs/src/routes/properties.rs index d871c44..d851f1c 100644 --- a/server-rs/src/routes/properties.rs +++ b/server-rs/src/routes/properties.rs @@ -9,15 +9,12 @@ use serde::{Deserialize, Serialize}; use tracing::{info, warn}; use crate::consts::{ - DEFAULT_PROPERTIES_LIMIT, ENUM_NULL, H3_PRECOMPUTE_MAX, H3_REQUEST_MAX, H3_REQUEST_MIN, + DEFAULT_PROPERTIES_LIMIT, H3_PRECOMPUTE_MAX, H3_REQUEST_MAX, H3_REQUEST_MIN, MAX_PROPERTIES_LIMIT, }; -use crate::data::EnumFeatureData; -use crate::filter::{parse_filters, row_passes_filters}; +use crate::parsing::{h3_cell_bounds, parse_filters, row_passes_filters}; use crate::state::AppState; -use super::parse::h3_cell_bounds; - #[derive(Deserialize)] pub struct HexagonPropertiesParams { pub h3: String, @@ -66,21 +63,25 @@ fn non_empty_string(text: &str) -> Option { } } +/// Look up an enum feature value by trying multiple possible column names. +/// Uses the unified feature model: enum values stored as f32 indices in feature_data. fn lookup_enum_value( - enum_features: &[EnumFeatureData], - enum_data: &[u8], - num_enums: usize, - enum_idx: &FxHashMap, + feature_names: &[String], + feature_data: &[f32], + num_features: usize, + enum_values: &FxHashMap>, row: usize, names: &[&str], ) -> Option { for name in names { - if let Some(&feature_index) = enum_idx.get(*name) { - let enum_feature = &enum_features[feature_index]; - let data_index = enum_data[row * num_enums + feature_index]; - if data_index != ENUM_NULL { - if let Some(value) = enum_feature.values.get(data_index as usize) { - return Some(value.clone()); + if let Some(feat_idx) = feature_names.iter().position(|feat_name| feat_name == *name) { + if let Some(values) = enum_values.get(&feat_idx) { + let value = feature_data[row * num_features + feat_idx]; + if value.is_finite() { + let idx = value as usize; + if let Some(str_value) = values.get(idx) { + return Some(str_value.clone()); + } } } } @@ -120,7 +121,7 @@ pub async fn get_hexagon_properties( let (parsed_filters, parsed_enum_filters) = parse_filters( params.filters.as_deref(), &state.data.feature_names, - &state.data.enum_features, + &state.data.enum_values, ); let num_filters = parsed_filters.len() + parsed_enum_filters.len(); @@ -131,10 +132,9 @@ pub async fn get_hexagon_properties( .map_err(|err| format!("Invalid H3 resolution {}: {}", resolution, err))?; let need_parent = resolution < H3_PRECOMPUTE_MAX; let num_features = state.data.num_features; - let num_enums = state.data.num_enums; let feature_data = &state.data.feature_data; - let enum_data_flat = &state.data.enum_data; - let enum_features = &state.data.enum_features; + let feature_names = &state.data.feature_names; + let enum_values = &state.data.enum_values; let (min_lat, min_lon, max_lat, max_lon) = h3_cell_bounds(cell, 0.001); @@ -162,8 +162,6 @@ pub async fn get_hexagon_properties( &parsed_enum_filters, feature_data, num_features, - enum_data_flat, - num_enums, ) { matching_rows.push(row); @@ -185,7 +183,11 @@ pub async fn get_hexagon_properties( .map(|&row| { let mut features = FxHashMap::default(); let base = row * num_features; - for (feat_idx, feat_name) in state.data.feature_names.iter().enumerate() { + for (feat_idx, feat_name) in feature_names.iter().enumerate() { + // Skip enum features in the generic features map + if enum_values.contains_key(&feat_idx) { + continue; + } let value = feature_data[base + feat_idx]; if value.is_finite() { features.insert(feat_name.clone(), value); @@ -197,42 +199,42 @@ pub async fn get_hexagon_properties( postcode: non_empty_string(state.data.postcode(row)), is_construction_date_approximate: Some(state.data.is_approx_build_date(row)), property_type: lookup_enum_value( - enum_features, - enum_data_flat, - num_enums, - &state.enum_name_to_idx, + feature_names, + feature_data, + num_features, + enum_values, row, &["Property type", "epc_property_type", "pp_property_type"], ), built_form: lookup_enum_value( - enum_features, - enum_data_flat, - num_enums, - &state.enum_name_to_idx, + feature_names, + feature_data, + num_features, + enum_values, row, &["Property type/built form", "built_form"], ), duration: lookup_enum_value( - enum_features, - enum_data_flat, - num_enums, - &state.enum_name_to_idx, + feature_names, + feature_data, + num_features, + enum_values, row, &["Leashold/Freehold", "duration"], ), current_energy_rating: lookup_enum_value( - enum_features, - enum_data_flat, - num_enums, - &state.enum_name_to_idx, + feature_names, + feature_data, + num_features, + enum_values, row, &["Current energy rating", "current_energy_rating"], ), potential_energy_rating: lookup_enum_value( - enum_features, - enum_data_flat, - num_enums, - &state.enum_name_to_idx, + feature_names, + feature_data, + num_features, + enum_values, row, &["Potential energy rating", "potential_energy_rating"], ), diff --git a/server-rs/src/state.rs b/server-rs/src/state.rs index 35b69c3..5b52ec4 100644 --- a/server-rs/src/state.rs +++ b/server-rs/src/state.rs @@ -1,14 +1,6 @@ -use rustc_hash::FxHashMap; -use serde::Serialize; - -use crate::data::{POIData, PropertyData}; -use crate::grid_index::GridIndex; - -#[derive(Serialize, Clone)] -pub struct POICategoryGroup { - pub name: String, - pub categories: Vec, -} +use crate::data::{POICategoryGroup, POIData, PropertyData}; +use crate::routes::FeaturesResponse; +use crate::utils::GridIndex; pub struct AppState { pub data: PropertyData, @@ -18,18 +10,14 @@ pub struct AppState { pub h3_cells: Vec, pub poi_data: POIData, pub poi_grid: GridIndex, - /// Precomputed JSON key names: "min_{feature_name}" for each numeric feature + /// Precomputed JSON key names: "min_{feature_name}" for each feature pub min_keys: Vec, - /// Precomputed JSON key names: "max_{feature_name}" for each numeric feature + /// Precomputed JSON key names: "max_{feature_name}" for each feature pub max_keys: Vec, - /// Precomputed JSON key names: "min_{enum_name}" for each enum feature - pub enum_min_keys: Vec, - /// Precomputed JSON key names: "max_{enum_name}" for each enum feature - pub enum_max_keys: Vec, /// Precomputed POI category groups (sorted) pub poi_category_groups: Vec, - /// Precomputed map from enum feature name to index in data.enum_features - pub enum_name_to_idx: FxHashMap, + /// Precomputed features response for /api/features endpoint + pub features_response: FeaturesResponse, /// URL of the OG screenshot sidecar service (e.g. http://og-screenshot:8002) pub og_sidecar_url: Option, /// Public-facing URL for absolute og:image URLs (e.g. https://narrowit.schmelczer.dev) diff --git a/server-rs/src/tests.rs b/server-rs/src/tests.rs deleted file mode 100644 index 467d265..0000000 --- a/server-rs/src/tests.rs +++ /dev/null @@ -1,251 +0,0 @@ -#[cfg(test)] -mod grid_index_tests { - use crate::grid_index::GridIndex; - - #[test] - fn query_bounds_fully_below_grid_returns_empty() { - let lat = vec![50.0_f32, 50.5, 51.0]; - let lon = vec![0.0_f32, 0.5, 1.0]; - let grid = GridIndex::build(&lat, &lon, 0.01); - - let results = grid.query(10.0, -10.0, 20.0, -5.0); - assert!( - results.is_empty(), - "Should return empty for bounds fully below grid" - ); - } - - #[test] - fn query_bounds_fully_above_grid_returns_empty() { - let lat = vec![50.0_f32, 50.5, 51.0]; - let lon = vec![0.0_f32, 0.5, 1.0]; - let grid = GridIndex::build(&lat, &lon, 0.01); - - let results = grid.query(80.0, 50.0, 90.0, 60.0); - assert!( - results.is_empty(), - "Should return empty for bounds fully above grid" - ); - } - - #[test] - fn query_inverted_bounds_returns_empty() { - let lat = vec![50.0_f32, 50.5, 51.0]; - let lon = vec![0.0_f32, 0.5, 1.0]; - let grid = GridIndex::build(&lat, &lon, 0.01); - - // south > north - let results = grid.query(52.0, 0.0, 49.0, 1.0); - assert!( - results.is_empty(), - "Should return empty for inverted bounds" - ); - } - - #[test] - fn for_each_bounds_fully_outside_yields_nothing() { - let lat = vec![50.0_f32, 50.5, 51.0]; - let lon = vec![0.0_f32, 0.5, 1.0]; - let grid = GridIndex::build(&lat, &lon, 0.01); - - let mut count = 0; - grid.for_each_in_bounds(10.0, -10.0, 20.0, -5.0, |_| count += 1); - assert_eq!( - count, 0, - "for_each should yield nothing for out-of-bounds query" - ); - } - - #[test] - fn query_with_large_cells_outside_returns_empty() { - // Previously, out-of-bounds queries with large cell sizes would - // scan cell (0,0) which could contain data. Now returns empty. - let lat = vec![50.0_f32]; - let lon = vec![0.0_f32]; - let grid = GridIndex::build(&lat, &lon, 1.0); - - let results = grid.query(0.0, -50.0, 10.0, -40.0); - assert!( - results.is_empty(), - "Should return empty even with large cell size" - ); - } - - #[test] - fn query_within_bounds_returns_correct_results() { - let lat = vec![50.0_f32, 50.5, 51.0]; - let lon = vec![0.0_f32, 0.5, 1.0]; - let grid = GridIndex::build(&lat, &lon, 0.01); - - let results = grid.query(49.9, -0.1, 51.1, 1.1); - assert_eq!(results.len(), 3, "Should return all 3 points within bounds"); - } - - #[test] - fn query_partial_bounds_returns_subset() { - let lat = vec![50.0_f32, 51.0, 52.0]; - let lon = vec![0.0_f32, 0.0, 0.0]; - let grid = GridIndex::build(&lat, &lon, 0.01); - - let results = grid.query(49.9, -0.1, 50.1, 0.1); - assert_eq!(results.len(), 1, "Should return only the point at lat=50"); - } -} - -#[cfg(test)] -mod filter_tests { - use crate::data::EnumFeatureData; - use crate::filter::{parse_filters, row_passes_filters}; - - #[test] - fn nan_rows_fail_numeric_filter_even_with_infinite_range() { - let feature_names = vec!["price".to_string()]; - let feature_data = vec![f32::NAN]; - let enum_features: Vec = vec![]; - let enum_data: Vec = vec![]; - - let (numeric, enums) = - parse_filters(Some("price:-inf:inf"), &feature_names, &enum_features); - assert_eq!(numeric.len(), 1, "Should parse -inf:inf as valid filter"); - - let passes = row_passes_filters(0, &numeric, &enums, &feature_data, 1, &enum_data, 0); - assert!(!passes, "NaN should fail filter even with infinite range"); - } - - #[test] - fn empty_enum_filter_value_rejects_all() { - let enum_features = vec![EnumFeatureData { - name: "rating".to_string(), - values: vec!["A".to_string(), "B".to_string()], - }]; - let feature_names: Vec = vec![]; - // Row-major enum data: 1 row, 1 enum, value=0 (index into "A") - let enum_data: Vec = vec![0]; - - let (numeric, enums) = parse_filters(Some("rating:"), &feature_names, &enum_features); - assert_eq!(enums.len(), 1); - assert!(enums[0].allowed.is_empty()); - - let passes = row_passes_filters(0, &numeric, &enums, &[], 0, &enum_data, 1); - assert!(!passes, "Empty allowed set should reject all rows"); - } - - #[test] - fn enum_filter_with_nonexistent_values_produces_empty_allowed() { - let enum_features = vec![EnumFeatureData { - name: "rating".to_string(), - values: vec!["A".to_string(), "B".to_string()], - }]; - let feature_names: Vec = vec![]; - - let (_, enums) = parse_filters(Some("rating:X|Y|Z"), &feature_names, &enum_features); - assert_eq!(enums.len(), 1); - assert!(enums[0].allowed.is_empty()); - } - - #[test] - fn malformed_numeric_min_is_silently_skipped() { - let feature_names = vec!["price".to_string()]; - let enum_features: Vec = vec![]; - - let (numeric, enums) = parse_filters( - Some("price:not_a_number:200"), - &feature_names, - &enum_features, - ); - assert_eq!(numeric.len(), 0); - assert_eq!(enums.len(), 0); - } -} - -#[cfg(test)] -mod json_tests { - #[test] - fn json_escaped_postcode_with_quotes_is_valid() { - use crate::routes::hexagons::write_json_escaped; - - let mut buf = String::new(); - buf.push_str("{\"postcode\":\""); - write_json_escaped(&mut buf, "SW1A \"test"); - buf.push_str("\"}"); - - let result: Result = serde_json::from_str(&buf); - assert!( - result.is_ok(), - "Escaped quote should produce valid JSON: {}", - buf - ); - assert_eq!(result.unwrap()["postcode"].as_str().unwrap(), "SW1A \"test"); - } - - #[test] - fn json_escaped_postcode_with_backslash_is_valid() { - use crate::routes::hexagons::write_json_escaped; - - let mut buf = String::new(); - buf.push_str("{\"postcode\":\""); - write_json_escaped(&mut buf, "SW1A\\2AA"); - buf.push_str("\"}"); - - let result: Result = serde_json::from_str(&buf); - assert!( - result.is_ok(), - "Escaped backslash should produce valid JSON: {}", - buf - ); - assert_eq!(result.unwrap()["postcode"].as_str().unwrap(), "SW1A\\2AA"); - } - - #[test] - fn nan_is_not_valid_json() { - use std::fmt::Write; - // Verify that raw NaN in write! is still invalid JSON (documenting the risk - // that the is_finite() guard in write_hexagons_json protects against). - let mut buf = String::new(); - write!(buf, "{{\"min_price\":{}}}", f64::NAN).unwrap(); - - let result: Result = serde_json::from_str(&buf); - assert!(result.is_err(), "Raw NaN should produce invalid JSON"); - } - - #[test] - fn infinity_is_not_valid_json() { - use std::fmt::Write; - let mut buf = String::new(); - write!(buf, "{{\"min_price\":{}}}", f64::INFINITY).unwrap(); - - let result: Result = serde_json::from_str(&buf); - assert!(result.is_err(), "Raw Infinity should produce invalid JSON"); - } -} - -#[cfg(test)] -mod enum_encoding_tests { - #[test] - fn u8_cast_wraps_around_beyond_255() { - // Documents the underlying u8 wrapping behavior that the truncation - // guard in property.rs now prevents. - let num_values = 300usize; - let indices: Vec = (0..num_values).map(|index| index as u8).collect(); - - assert_eq!(indices[0], indices[256], "u8 wraps: 0 == 256"); - assert_eq!(indices[1], indices[257], "u8 wraps: 1 == 257"); - - use std::collections::HashMap; - let values: Vec = (0..num_values).map(|i| format!("val_{}", i)).collect(); - let value_to_idx: HashMap<&str, u8> = values - .iter() - .enumerate() - .map(|(index, value)| (value.as_str(), index as u8)) - .collect(); - - let unique_indices: std::collections::HashSet = - value_to_idx.values().cloned().collect(); - assert!( - unique_indices.len() < num_values, - "Without the truncation guard, {} values produce only {} unique u8 indices", - num_values, - unique_indices.len() - ); - } -} diff --git a/server-rs/src/utils.rs b/server-rs/src/utils.rs new file mode 100644 index 0000000..c2bd3f1 --- /dev/null +++ b/server-rs/src/utils.rs @@ -0,0 +1,7 @@ +mod grid_index; +mod hash; +mod interned_column; + +pub use grid_index::GridIndex; +pub use hash::{generate_priorities, splitmix64_hash}; +pub use interned_column::InternedColumn; diff --git a/server-rs/src/grid_index.rs b/server-rs/src/utils/grid_index.rs similarity index 83% rename from server-rs/src/grid_index.rs rename to server-rs/src/utils/grid_index.rs index e8d2967..079e03a 100644 --- a/server-rs/src/grid_index.rs +++ b/server-rs/src/utils/grid_index.rs @@ -1,3 +1,5 @@ +use tracing::debug; + /// Grid-based spatial index for fast rectangle queries over property rows. /// /// Divides the bounding box into cells of ~0.01 degrees (~1km). @@ -19,6 +21,18 @@ pub struct GridIndex { impl GridIndex { pub fn build(lat: &[f32], lon: &[f32], cell_size: f32) -> Self { + if lat.is_empty() { + return GridIndex { + min_lat: 0.0, + min_lon: 0.0, + cell_size, + cols: 0, + rows: 0, + values: Vec::new(), + offsets: vec![0], + }; + } + let mut min_lat = f32::INFINITY; let mut max_lat = f32::NEG_INFINITY; let mut min_lon = f32::INFINITY; @@ -48,7 +62,7 @@ impl GridIndex { let cols = ((max_lon - min_lon) / cell_size).ceil() as usize + 1; let num_cells = rows * cols; - tracing::debug!( + debug!( rows_grid = rows, cols_grid = cols, total_cells = num_cells, @@ -86,7 +100,7 @@ impl GridIndex { cursors[cell_index] += 1; } - tracing::debug!("Grid index built (CSR)"); + debug!("Grid index built (CSR)"); GridIndex { min_lat, @@ -184,3 +198,33 @@ impl GridIndex { Some((row_min, row_max, col_min, col_max)) } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn query_returns_correct_indices() { + let lat = vec![50.0_f32, 50.5, 51.0]; + let lon = vec![0.0_f32, 0.5, 1.0]; + let grid = GridIndex::build(&lat, &lon, 0.1); + + let results = grid.query(49.9, -0.1, 50.1, 0.1); + assert_eq!(results, vec![0]); + } + + #[test] + fn query_outside_bounds_returns_empty() { + let lat = vec![50.0_f32]; + let lon = vec![0.0_f32]; + let grid = GridIndex::build(&lat, &lon, 0.1); + + assert!(grid.query(0.0, 0.0, 1.0, 1.0).is_empty()); + } + + #[test] + fn empty_input_returns_empty_results() { + let grid = GridIndex::build(&[], &[], 0.1); + assert!(grid.query(-90.0, -180.0, 90.0, 180.0).is_empty()); + } +} diff --git a/server-rs/src/utils/hash.rs b/server-rs/src/utils/hash.rs new file mode 100644 index 0000000..178ed26 --- /dev/null +++ b/server-rs/src/utils/hash.rs @@ -0,0 +1,39 @@ +/// Generate a deterministic pseudo-random priority value from an index using splitmix64. +/// Used for shuffling rows in a deterministic but random-looking order. +#[inline] +pub fn splitmix64_hash(index: usize) -> u32 { + let mut hash = (index as u64).wrapping_mul(0x9E37_79B9_7F4A_7C15); + hash = (hash ^ (hash >> 30)).wrapping_mul(0xBF58_476D_1CE4_E5B9); + hash = (hash ^ (hash >> 27)).wrapping_mul(0x94D0_49BB_1331_11EB); + hash = hash ^ (hash >> 31); + hash as u32 +} + +/// Generate priority values for a range of indices. +pub fn generate_priorities(row_count: usize) -> Vec { + (0..row_count).map(splitmix64_hash).collect() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn deterministic() { + assert_eq!(splitmix64_hash(0), splitmix64_hash(0)); + assert_eq!(splitmix64_hash(12345), splitmix64_hash(12345)); + } + + #[test] + fn different_inputs_differ() { + assert_ne!(splitmix64_hash(0), splitmix64_hash(1)); + assert_ne!(splitmix64_hash(100), splitmix64_hash(101)); + } + + #[test] + fn generate_priorities_length() { + assert_eq!(generate_priorities(0).len(), 0); + assert_eq!(generate_priorities(5).len(), 5); + assert_eq!(generate_priorities(1000).len(), 1000); + } +} diff --git a/server-rs/src/utils/interned_column.rs b/server-rs/src/utils/interned_column.rs new file mode 100644 index 0000000..8308657 --- /dev/null +++ b/server-rs/src/utils/interned_column.rs @@ -0,0 +1,68 @@ +/// Interned string column: a small set of unique values indexed by u16 per row. +pub struct InternedColumn { + pub values: Vec, + pub indices: Vec, +} + +impl InternedColumn { + pub fn build(raw: &[String]) -> Self { + let mut unique_map: rustc_hash::FxHashMap<&str, u16> = rustc_hash::FxHashMap::default(); + let mut values: Vec = Vec::new(); + let mut indices = Vec::with_capacity(raw.len()); + + for text in raw { + let idx = if let Some(&existing) = unique_map.get(text.as_str()) { + existing + } else { + assert!( + values.len() < u16::MAX as usize, + "InternedColumn overflow: more than {} unique values", + u16::MAX + ); + let idx = values.len() as u16; + values.push(text.clone()); + unique_map.insert(text.as_str(), idx); + idx + }; + indices.push(idx); + } + + InternedColumn { values, indices } + } + + /// Resolve the string for a given row. + pub fn get(&self, row: usize) -> &str { + &self.values[self.indices[row] as usize] + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn basic_interning() { + let raw: Vec = vec!["a".into(), "b".into(), "a".into(), "c".into(), "b".into()]; + let col = InternedColumn::build(&raw); + + assert_eq!(col.values, vec!["a", "b", "c"]); + assert_eq!(col.indices, vec![0, 1, 0, 2, 1]); + assert_eq!(col.get(0), "a"); + assert_eq!(col.get(2), "a"); + assert_eq!(col.get(3), "c"); + } + + #[test] + fn empty_input() { + let col = InternedColumn::build(&[]); + assert!(col.values.is_empty()); + assert!(col.indices.is_empty()); + } + + #[test] + #[should_panic(expected = "InternedColumn overflow")] + fn u16_overflow_panics() { + let raw: Vec = (0..=u16::MAX as u32).map(|i| i.to_string()).collect(); + let _col = InternedColumn::build(&raw); + } +}