From 01ec17ff04b9592a5b200288537cc3b87b1c39d3 Mon Sep 17 00:00:00 2001 From: Andras Schmelczer Date: Sat, 31 Jan 2026 20:25:54 +0000 Subject: [PATCH] Refactor the server --- server-rs/Cargo.lock | 92 +++++ server-rs/Cargo.toml | 4 +- server-rs/rust-toolchain.toml | 8 + server-rs/src/consts.rs | 14 +- server-rs/src/data.rs | 555 ------------------------- server-rs/src/filter.rs | 85 ++++ server-rs/src/index.rs | 18 +- server-rs/src/main.rs | 57 ++- server-rs/src/routes.rs | 636 ----------------------------- server-rs/src/routes/features.rs | 87 ++++ server-rs/src/routes/hexagons.rs | 257 ++++++++++++ server-rs/src/routes/mod.rs | 9 + server-rs/src/routes/pois.rs | 133 ++++++ server-rs/src/routes/properties.rs | 198 +++++++++ server-rs/src/state.rs | 12 + 15 files changed, 939 insertions(+), 1226 deletions(-) create mode 100644 server-rs/rust-toolchain.toml delete mode 100644 server-rs/src/data.rs create mode 100644 server-rs/src/filter.rs delete mode 100644 server-rs/src/routes.rs create mode 100644 server-rs/src/routes/features.rs create mode 100644 server-rs/src/routes/hexagons.rs create mode 100644 server-rs/src/routes/mod.rs create mode 100644 server-rs/src/routes/pois.rs create mode 100644 server-rs/src/routes/properties.rs create mode 100644 server-rs/src/state.rs diff --git a/server-rs/Cargo.lock b/server-rs/Cargo.lock index ddb90e7..abca744 100644 --- a/server-rs/Cargo.lock +++ b/server-rs/Cargo.lock @@ -921,6 +921,12 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + [[package]] name = "libc" version = "0.2.180" @@ -979,6 +985,15 @@ dependencies = [ "libc", ] +[[package]] +name = "matchers" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1525a2a28c7f4fa0fc98bb91ae755d1e2d1505079e05539e35bc876b5d65ae9" +dependencies = [ + "regex-automata", +] + [[package]] name = "matchit" version = "0.8.4" @@ -1055,6 +1070,15 @@ dependencies = [ "winapi", ] +[[package]] +name = "nu-ansi-term" +version = "0.50.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" +dependencies = [ + "windows-sys 0.61.2", +] + [[package]] name = "num-traits" version = "0.2.19" @@ -1678,6 +1702,8 @@ dependencies = [ "serde_json", "tokio", "tower-http", + "tracing", + "tracing-subscriber", ] [[package]] @@ -1935,6 +1961,15 @@ dependencies = [ "serde", ] +[[package]] +name = "sharded-slab" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6" +dependencies = [ + "lazy_static", +] + [[package]] name = "shlex" version = "1.3.0" @@ -2118,6 +2153,15 @@ dependencies = [ "syn", ] +[[package]] +name = "thread_local" +version = "1.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f60246a4944f24f6e018aa17cdeffb7818b76356965d03b07d6a9886e8962185" +dependencies = [ + "cfg-if", +] + [[package]] name = "tiny-keccak" version = "2.0.2" @@ -2246,9 +2290,21 @@ checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100" dependencies = [ "log", "pin-project-lite", + "tracing-attributes", "tracing-core", ] +[[package]] +name = "tracing-attributes" +version = "0.1.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "tracing-core" version = "0.1.36" @@ -2256,6 +2312,36 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a" dependencies = [ "once_cell", + "valuable", +] + +[[package]] +name = "tracing-log" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3" +dependencies = [ + "log", + "once_cell", + "tracing-core", +] + +[[package]] +name = "tracing-subscriber" +version = "0.3.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f30143827ddab0d256fd843b7a66d164e9f271cfa0dde49142c5ca0ca291f1e" +dependencies = [ + "matchers", + "nu-ansi-term", + "once_cell", + "regex-automata", + "sharded-slab", + "smallvec", + "thread_local", + "tracing", + "tracing-core", + "tracing-log", ] [[package]] @@ -2311,6 +2397,12 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "valuable" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" + [[package]] name = "version_check" version = "0.9.5" diff --git a/server-rs/Cargo.toml b/server-rs/Cargo.toml index 5ddaa7d..24c355a 100644 --- a/server-rs/Cargo.toml +++ b/server-rs/Cargo.toml @@ -5,7 +5,7 @@ edition = "2021" [dependencies] axum = "0.8" -tower-http = { version = "0.6", features = ["cors", "fs", "compression-gzip"] } +tower-http = { version = "0.6", features = ["cors", "fs", "compression-gzip", "trace"] } tokio = { version = "1", features = ["full"] } polars = { version = "0.46", features = ["parquet", "lazy", "dtype-struct", "dtype-u8", "dtype-u16", "dtype-i8", "dtype-i16"] } h3o = "0.7" @@ -13,6 +13,8 @@ serde = { version = "1", features = ["derive"] } serde_json = "1" rayon = "1" rustc-hash = "2" +tracing = "0.1" +tracing-subscriber = { version = "0.3", features = ["env-filter", "fmt"] } [profile.release] opt-level = 3 diff --git a/server-rs/rust-toolchain.toml b/server-rs/rust-toolchain.toml new file mode 100644 index 0000000..3185d77 --- /dev/null +++ b/server-rs/rust-toolchain.toml @@ -0,0 +1,8 @@ +[toolchain] +channel = "stable" +targets = [ + "x86_64-unknown-linux-gnu", + "x86_64-unknown-linux-musl", + "aarch64-unknown-linux-gnu", +] +profile = "default" diff --git a/server-rs/src/consts.rs b/server-rs/src/consts.rs index a72db94..c7190cf 100644 --- a/server-rs/src/consts.rs +++ b/server-rs/src/consts.rs @@ -1,14 +1,20 @@ -/// Lower percentile for feature range reporting pub const FEATURE_PERCENTILE_LOW: f64 = 2.0; -/// Upper percentile for feature range reporting pub const FEATURE_PERCENTILE_HIGH: f64 = 98.0; pub const HISTOGRAM_BINS: usize = 100; -/// H3 resolutions to precompute at startup (covers typical zoom levels) pub const H3_PRECOMPUTE_MIN: u8 = 4; pub const H3_PRECOMPUTE_MAX: u8 = 12; -/// Columns to exclude from feature discovery pub const EXCLUDED_COLUMNS: &[&str] = &["lat", "lon"]; + +pub const EXCLUDED_STRING_COLUMNS: &[&str] = &[ + "pp_address", + "postcode", + "Address per Property Register", + "Address per EPC", + "Postcode", +]; + +pub const MAX_ENUM_CARDINALITY: usize = 50; diff --git a/server-rs/src/data.rs b/server-rs/src/data.rs deleted file mode 100644 index 2fec011..0000000 --- a/server-rs/src/data.rs +++ /dev/null @@ -1,555 +0,0 @@ -use polars::lazy::frame::LazyFrame; -use polars::prelude::*; -use rayon::prelude::*; -use serde::Serialize; -use std::path::Path; - -use crate::consts::{ - EXCLUDED_COLUMNS, FEATURE_PERCENTILE_HIGH, FEATURE_PERCENTILE_LOW, H3_PRECOMPUTE_MAX, - H3_PRECOMPUTE_MIN, HISTOGRAM_BINS, -}; - -/// Returns true if the polars DataType is numeric (integer or float) -fn is_numeric_dtype(dtype: &DataType) -> bool { - matches!( - dtype, - DataType::Int8 - | DataType::Int16 - | DataType::Int32 - | DataType::Int64 - | DataType::UInt8 - | DataType::UInt16 - | DataType::UInt32 - | DataType::UInt64 - | DataType::Float32 - | DataType::Float64 - ) -} - -/// Histogram for a single feature column -#[derive(Serialize, Clone)] -pub struct Histogram { - /// Left edge of first bin - pub min: f64, - /// Right edge of last bin - pub max: f64, - /// Width of each bin - pub bin_width: f64, - /// Count of values in each bin - pub counts: Vec, -} - -/// Precomputed statistics for a single feature -pub struct FeatureStats { - pub p_low: f64, - pub p_high: f64, - pub histogram: Histogram, -} - -/// Columnar storage for all property data. -/// Feature values use NaN as the null sentinel. -pub struct PropertyData { - pub lat: Vec, - pub lon: Vec, - /// Dynamically discovered numeric feature column names - pub feature_names: Vec, - /// Number of feature columns - pub num_features: usize, - /// Row-major flat array: feature_data[row * num_features + feat_idx]. - /// NaN = null. Contiguous layout for cache-friendly per-row access. - pub feature_data: Vec, - /// Precomputed stats (percentiles + histogram) for each feature - pub feature_stats: Vec, - /// String fields for property details - pub address: Vec, - pub postcode: Vec, - pub property_type: Vec, - pub built_form: Vec, - pub current_energy_rating: Vec, - pub potential_energy_rating: Vec, -} - -/// Approximate a percentile from a histogram using linear interpolation. -/// `p` is in [0, 100]. `total` is the sum of all bin counts. -fn percentile_from_histogram( - counts: &[u64], - min: f64, - bin_width: f64, - total: usize, - p: f64, -) -> f64 { - let target = (p / 100.0) * (total as f64 - 1.0); - let mut cumulative = 0u64; - for (i, &c) in counts.iter().enumerate() { - let prev = cumulative; - cumulative += c; - if cumulative as f64 > target { - // Interpolate within this bin - let frac = if c > 0 { - (target - prev as f64) / c as f64 - } else { - 0.0 - }; - return min + (i as f64 + frac) * bin_width; - } - } - // Fallback: right edge of last bin - min + counts.len() as f64 * bin_width -} - -/// Build a histogram and compute approximate percentiles in O(n) — no sort needed. -fn compute_feature_stats(vals: &[f64]) -> FeatureStats { - // Single pass: min, max, count (skipping NaN) - let mut min = f64::INFINITY; - let mut max = f64::NEG_INFINITY; - let mut count = 0usize; - for &v in vals { - if !v.is_nan() { - if v < min { - min = v; - } - if v > max { - max = v; - } - count += 1; - } - } - - if count == 0 { - return FeatureStats { - p_low: 0.0, - p_high: 0.0, - histogram: Histogram { - min: 0.0, - max: 0.0, - bin_width: 1.0, - counts: vec![0; HISTOGRAM_BINS], - }, - }; - } - - // Build histogram over full range (second pass, no sort) - let range = if max == min { 1.0 } else { max - min }; - let bin_max = min + range * (1.0 + 1e-9); - let bin_width = (bin_max - min) / HISTOGRAM_BINS as f64; - - let mut counts = vec![0u64; HISTOGRAM_BINS]; - for &v in vals { - if !v.is_nan() { - let bin = ((v - min) / bin_width) as usize; - counts[bin.min(HISTOGRAM_BINS - 1)] += 1; - } - } - - // Approximate percentiles from the histogram - let p_low = percentile_from_histogram(&counts, min, bin_width, count, FEATURE_PERCENTILE_LOW); - let p_high = percentile_from_histogram(&counts, min, bin_width, count, FEATURE_PERCENTILE_HIGH); - - FeatureStats { - p_low, - p_high, - histogram: Histogram { - min, - max, - bin_width, - counts, - }, - } -} - -/// Convert a polars Column to Vec using NaN for null values -fn column_to_f64_vec(c: &Column) -> Vec { - let s = c.cast(&DataType::Float64).unwrap(); - let ca = s.f64().unwrap(); - ca.into_iter().map(|v| v.unwrap_or(f64::NAN)).collect() -} - -/// Precompute H3 cell IDs for all rows at commonly used resolutions. -/// Returns a Vec indexed by resolution (0..16), where non-precomputed -/// resolutions have an empty Vec. -pub fn precompute_h3(lat: &[f64], lon: &[f64]) -> Vec> { - eprintln!( - "Precomputing H3 cells for resolutions {}..{}...", - H3_PRECOMPUTE_MIN, H3_PRECOMPUTE_MAX - ); - - let resolutions: Vec = (H3_PRECOMPUTE_MIN..=H3_PRECOMPUTE_MAX).collect(); - let computed: Vec<(u8, Vec)> = resolutions - .into_par_iter() - .map(|res| { - let h3_res = h3o::Resolution::try_from(res).unwrap(); - let cells: Vec = lat - .iter() - .zip(lon.iter()) - .map(|(&la, &lo)| { - h3o::LatLng::new(la, lo) - .map(|c| u64::from(c.to_cell(h3_res))) - .unwrap_or(0) - }) - .collect(); - eprintln!(" Resolution {} done ({} cells)", res, cells.len()); - (res, cells) - }) - .collect(); - - let mut result: Vec> = (0..16).map(|_| Vec::new()).collect(); - for (res, cells) in computed { - result[res as usize] = cells; - } - - eprintln!("H3 precomputation complete."); - result -} - -impl PropertyData { - pub fn load(parquet_path: &Path) -> Self { - eprintln!("Loading parquet from {:?}...", parquet_path); - - // Scan schema to discover numeric feature columns - let mut lf = LazyFrame::scan_parquet(parquet_path, Default::default()) - .expect("Failed to scan parquet"); - let schema = lf.collect_schema().expect("Failed to read schema"); - - let feature_names: Vec = schema - .iter() - .filter(|(name, dtype)| { - is_numeric_dtype(dtype) && !EXCLUDED_COLUMNS.contains(&name.as_str()) - }) - .map(|(name, _)| name.to_string()) - .collect(); - - let num_features = feature_names.len(); - eprintln!("Discovered {} numeric feature columns", num_features); - - // Read only the columns we need - let mut cols_needed: Vec = vec!["lat".into(), "lon".into()]; - cols_needed.extend(feature_names.iter().cloned()); - - // Add string columns (using actual column names from parquet) - let string_cols = vec![ - "pp_address", - "postcode", - "pp_property_type", - "built_form", - "current_energy_rating", - "potential_energy_rating", - ]; - - // Build selection with proper casting - let mut select_exprs: Vec = vec![]; - - // lat/lon as f64 - select_exprs.push(col("lat").cast(DataType::Float64)); - select_exprs.push(col("lon").cast(DataType::Float64)); - - // numeric features as f64 - for name in &feature_names { - select_exprs.push(col(name.as_str()).cast(DataType::Float64)); - } - - // string columns as string (check if they exist in schema) - for &s_col in &string_cols { - if schema.get(s_col).is_some() { - select_exprs.push(col(s_col).cast(DataType::String)); - } - } - - let df = LazyFrame::scan_parquet(parquet_path, Default::default()) - .expect("Failed to scan parquet") - .select(select_exprs) - .collect() - .expect("Failed to read parquet"); - - let row_count = df.height(); - eprintln!("Loaded {} rows", row_count); - - // Extract lat/lon using bulk iterator - let lat_series = df.column("lat").unwrap().cast(&DataType::Float64).unwrap(); - let lat: Vec = lat_series - .f64() - .unwrap() - .into_iter() - .map(|v| v.unwrap_or(0.0)) - .collect(); - - let lon_series = df.column("lon").unwrap().cast(&DataType::Float64).unwrap(); - let lon: Vec = lon_series - .f64() - .unwrap() - .into_iter() - .map(|v| v.unwrap_or(0.0)) - .collect(); - - // Extract feature columns (column-major, for cache-friendly histogram computation) - eprintln!("Extracting feature columns..."); - let col_major: Vec> = feature_names - .iter() - .map(|name| { - let s = df.column(name.as_str()).unwrap(); - column_to_f64_vec(s) - }) - .collect(); - - // Compute histograms in parallel (column-major is ideal for per-column iteration) - eprintln!("Computing histograms..."); - let feature_stats: Vec = col_major - .par_iter() - .enumerate() - .map(|(i, vals)| { - let stats = compute_feature_stats(vals); - eprintln!( - " {}: p{}={:.2}, p{}={:.2}, {} bins", - feature_names[i], - FEATURE_PERCENTILE_LOW, - stats.p_low, - FEATURE_PERCENTILE_HIGH, - stats.p_high, - stats.histogram.counts.len() - ); - stats - }) - .collect(); - - // Extract string columns (before permutation) - eprintln!("Extracting string columns..."); - let address_raw: Vec = if let Ok(col) = df.column("pp_address") { - col.str() - .unwrap() - .into_iter() - .map(|v| v.unwrap_or("").to_string()) - .collect() - } else { - vec![String::new(); row_count] - }; - - let postcode_raw: Vec = if let Ok(col) = df.column("postcode") { - col.str() - .unwrap() - .into_iter() - .map(|v| v.unwrap_or("").to_string()) - .collect() - } else { - vec![String::new(); row_count] - }; - - let property_type_raw: Vec = if let Ok(col) = df.column("pp_property_type") { - col.str() - .unwrap() - .into_iter() - .map(|v| v.unwrap_or("").to_string()) - .collect() - } else { - vec![String::new(); row_count] - }; - - let built_form_raw: Vec = if let Ok(col) = df.column("built_form") { - col.str() - .unwrap() - .into_iter() - .map(|v| v.unwrap_or("").to_string()) - .collect() - } else { - vec![String::new(); row_count] - }; - - let current_energy_rating_raw: Vec = - if let Ok(col) = df.column("current_energy_rating") { - col.str() - .unwrap() - .into_iter() - .map(|v| v.unwrap_or("").to_string()) - .collect() - } else { - vec![String::new(); row_count] - }; - - let potential_energy_rating_raw: Vec = - if let Ok(col) = df.column("potential_energy_rating") { - col.str() - .unwrap() - .into_iter() - .map(|v| v.unwrap_or("").to_string()) - .collect() - } else { - vec![String::new(); row_count] - }; - - // Sort all rows by spatial locality so that grid queries access - // contiguous memory (sequential reads instead of random DRAM accesses). - // Uses the same 0.01° grid cell as the spatial index for the sort key. - eprintln!("Sorting rows by spatial locality..."); - let grid_cell_size = 0.01_f64; - let min_lat_val = lat.iter().cloned().fold(f64::INFINITY, f64::min) - grid_cell_size; - let min_lon_val = lon.iter().cloned().fold(f64::INFINITY, f64::min) - grid_cell_size; - let max_lon_val = lon.iter().cloned().fold(f64::NEG_INFINITY, f64::max) + grid_cell_size; - let grid_cols = ((max_lon_val - min_lon_val) / grid_cell_size).ceil() as u64 + 1; - - let mut perm: Vec = (0..row_count as u32).collect(); - perm.sort_unstable_by_key(|&i| { - let r = ((lat[i as usize] - min_lat_val) / grid_cell_size) as u64; - let c = ((lon[i as usize] - min_lon_val) / grid_cell_size) as u64; - r * grid_cols + c - }); - - // Apply permutation to lat/lon - let lat: Vec = perm.iter().map(|&i| lat[i as usize]).collect(); - let lon: Vec = perm.iter().map(|&i| lon[i as usize]).collect(); - - // Apply permutation to string columns - let address: Vec = perm - .iter() - .map(|&i| address_raw[i as usize].clone()) - .collect(); - let postcode: Vec = perm - .iter() - .map(|&i| postcode_raw[i as usize].clone()) - .collect(); - let property_type: Vec = perm - .iter() - .map(|&i| property_type_raw[i as usize].clone()) - .collect(); - let built_form: Vec = perm - .iter() - .map(|&i| built_form_raw[i as usize].clone()) - .collect(); - let current_energy_rating: Vec = perm - .iter() - .map(|&i| current_energy_rating_raw[i as usize].clone()) - .collect(); - let potential_energy_rating: Vec = perm - .iter() - .map(|&i| potential_energy_rating_raw[i as usize].clone()) - .collect(); - - // Transpose to row-major AND apply spatial permutation in one pass. - // Result: all features for one row are contiguous, and spatially - // nearby rows are adjacent in memory. - eprintln!("Transposing to row-major layout (spatially sorted)..."); - let mut feature_data = vec![f64::NAN; row_count * num_features]; - for (new_row, &old_row) in perm.iter().enumerate() { - let old = old_row as usize; - let dst_base = new_row * num_features; - for (feat_idx, col_vec) in col_major.iter().enumerate() { - feature_data[dst_base + feat_idx] = col_vec[old]; - } - } - - eprintln!("Data loading complete."); - - PropertyData { - lat, - lon, - feature_names, - num_features, - feature_data, - feature_stats, - address, - postcode, - property_type, - built_form, - current_energy_rating, - potential_energy_rating, - } - } -} - -/// Point of Interest data -#[derive(Serialize)] -pub struct POI { - pub id: String, - pub name: String, - pub category: String, - pub lat: f64, - pub lng: f64, - pub emoji: String, -} - -/// Columnar storage for POI data -pub struct POIData { - pub id: Vec, - pub name: Vec, - pub category: Vec, - pub lat: Vec, - pub lng: Vec, - pub emoji: Vec, -} - -impl POIData { - pub fn load(parquet_path: &Path) -> Self { - eprintln!("Loading POI data from {:?}...", parquet_path); - - let df = LazyFrame::scan_parquet(parquet_path, Default::default()) - .expect("Failed to scan POI parquet") - .collect() - .expect("Failed to read POI parquet"); - - let row_count = df.height(); - eprintln!("Loaded {} POIs", row_count); - - // Extract columns - let id: Vec = df - .column("id") - .unwrap() - .str() - .unwrap() - .into_iter() - .map(|v| v.unwrap_or("").to_string()) - .collect(); - - let name: Vec = df - .column("name") - .unwrap() - .str() - .unwrap() - .into_iter() - .map(|v| v.unwrap_or("").to_string()) - .collect(); - - let category: Vec = df - .column("category") - .unwrap() - .str() - .unwrap() - .into_iter() - .map(|v| v.unwrap_or("").to_string()) - .collect(); - - let lat: Vec = df - .column("lat") - .unwrap() - .f64() - .unwrap() - .into_iter() - .map(|v| v.unwrap_or(0.0)) - .collect(); - - let lng: Vec = df - .column("lng") - .unwrap() - .f64() - .unwrap() - .into_iter() - .map(|v| v.unwrap_or(0.0)) - .collect(); - - let emoji: Vec = df - .column("emoji") - .unwrap() - .str() - .unwrap() - .into_iter() - .map(|v| v.unwrap_or("").to_string()) - .collect(); - - eprintln!("POI data loading complete."); - - POIData { - id, - name, - category, - lat, - lng, - emoji, - } - } -} diff --git a/server-rs/src/filter.rs b/server-rs/src/filter.rs new file mode 100644 index 0000000..167c1a2 --- /dev/null +++ b/server-rs/src/filter.rs @@ -0,0 +1,85 @@ +use crate::data::EnumFeatureData; + +pub struct ParsedFilter { + pub feat_idx: usize, + pub min: f64, + pub max: f64, +} + +pub struct ParsedEnumFilter { + pub enum_idx: usize, + pub allowed: Vec, +} + +/// Parse comma-separated filter string into numeric and enum filters. +/// Numeric format: `name:min:max` +/// Enum format: `name:val1|val2|val3` (pipe-separated values) +pub fn parse_filters( + filter_str: Option<&str>, + feature_names: &[String], + enum_features: &[EnumFeatureData], +) -> (Vec, Vec) { + let mut numeric = Vec::new(); + let mut enums = Vec::new(); + + let s = match filter_str.filter(|s| !s.is_empty()) { + Some(s) => s, + None => return (numeric, enums), + }; + + for entry in s.split(',') { + let parts: Vec<&str> = entry.splitn(2, ':').collect(); + if parts.len() != 2 { + continue; + } + let name = parts[0].trim(); + let rest = parts[1].trim(); + + if let Some(enum_idx) = enum_features.iter().position(|ef| ef.name == name) { + let ef = &enum_features[enum_idx]; + let allowed: Vec = rest + .split('|') + .filter_map(|v| { + let v = v.trim(); + ef.values.iter().position(|ev| ev == v).map(|i| i as u8) + }) + .collect(); + enums.push(ParsedEnumFilter { enum_idx, allowed }); + } else { + let num_parts: Vec<&str> = rest.splitn(2, ':').collect(); + if num_parts.len() != 2 { + continue; + } + let min = match num_parts[0].trim().parse::() { + Ok(v) => v, + Err(_) => continue, + }; + let max = match num_parts[1].trim().parse::() { + Ok(v) => v, + Err(_) => continue, + }; + if let Some(feat_idx) = feature_names.iter().position(|n| n == name) { + numeric.push(ParsedFilter { feat_idx, min, max }); + } + } + } + + (numeric, enums) +} + +pub fn row_passes_filters( + row: usize, + filters: &[ParsedFilter], + enum_filters: &[ParsedEnumFilter], + feature_data: &[f64], + num_features: usize, + enum_features: &[EnumFeatureData], +) -> bool { + filters.iter().all(|f| { + let v = feature_data[row * num_features + f.feat_idx]; + v.is_finite() && v >= f.min && v <= f.max + }) && enum_filters.iter().all(|ef| { + let v = enum_features[ef.enum_idx].data[row]; + v != 255 && ef.allowed.contains(&v) + }) +} diff --git a/server-rs/src/index.rs b/server-rs/src/index.rs index ee13588..03412f6 100644 --- a/server-rs/src/index.rs +++ b/server-rs/src/index.rs @@ -14,9 +14,7 @@ pub struct GridIndex { } impl GridIndex { - /// Build the grid index from lat/lon arrays. pub fn build(lat: &[f64], lon: &[f64], cell_size: f64) -> Self { - // Compute bounding box with a small margin let mut min_lat = f64::INFINITY; let mut max_lat = f64::NEG_INFINITY; let mut min_lon = f64::INFINITY; @@ -39,7 +37,6 @@ impl GridIndex { } } - // Add margin min_lat -= cell_size; min_lon -= cell_size; max_lat += cell_size; @@ -48,12 +45,12 @@ impl GridIndex { let rows = ((max_lat - min_lat) / cell_size).ceil() as usize + 1; let cols = ((max_lon - min_lon) / cell_size).ceil() as usize + 1; - eprintln!( - "Building grid index: {}x{} cells ({} total), cell_size={}", - rows, - cols, - rows * cols, - cell_size + tracing::debug!( + rows_grid = rows, + cols_grid = cols, + total_cells = rows * cols, + cell_size, + "Building grid index" ); let mut cells: Vec> = vec![Vec::new(); rows * cols]; @@ -65,7 +62,7 @@ impl GridIndex { cells[idx].push(i as u32); } - eprintln!("Grid index built."); + tracing::debug!("Grid index built"); GridIndex { min_lat, @@ -77,7 +74,6 @@ impl GridIndex { } } - /// Query all row indices within the given bounding box. pub fn query(&self, south: f64, west: f64, north: f64, east: f64) -> Vec { let (r_min, r_max, c_min, c_max) = self.clamp_bounds(south, west, north, east); diff --git a/server-rs/src/main.rs b/server-rs/src/main.rs index ccd8831..23ec229 100644 --- a/server-rs/src/main.rs +++ b/server-rs/src/main.rs @@ -1,7 +1,9 @@ mod consts; mod data; +mod filter; mod index; mod routes; +mod state; use std::path::PathBuf; use std::sync::Arc; @@ -11,41 +13,55 @@ use axum::Router; use tower_http::compression::CompressionLayer; use tower_http::cors::{Any, CorsLayer}; use tower_http::services::ServeDir; +use tower_http::trace::TraceLayer; +use tracing::info; +use tracing_subscriber::EnvFilter; -use routes::AppState; +use state::AppState; #[tokio::main] async fn main() { + tracing_subscriber::fmt() + .with_env_filter( + EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info")), + ) + .with_ansi(true) + .init(); + let parquet_path = PathBuf::from( std::env::args() .nth(1) .unwrap_or_else(|| "data_sources/processed/wide.parquet".to_string()), ); if !parquet_path.exists() { - eprintln!("Error: {} not found.", parquet_path.display()); + tracing::error!("Parquet file not found: {}", parquet_path.display()); std::process::exit(1); } - // Load property data and build indices + info!("Loading property data from {}", parquet_path.display()); let property_data = data::PropertyData::load(&parquet_path); + info!( + rows = property_data.lat.len(), + features = property_data.num_features, + enums = property_data.enum_features.len(), + "Property data loaded" + ); + + info!("Building spatial grid index (0.01° cells)"); let grid = index::GridIndex::build(&property_data.lat, &property_data.lon, 0.01); + + info!("Precomputing H3 cells for resolutions {}-{}", consts::H3_PRECOMPUTE_MIN, consts::H3_PRECOMPUTE_MAX); let h3_cells = data::precompute_h3(&property_data.lat, &property_data.lon); - // Load POI data and build spatial index - // Derive POI path from the data parquet path (same directory) - let poi_path = parquet_path - .parent() - .and_then(|p| p.parent()) - .map(|p| p.join("filtered_uk_pois.parquet")) - .unwrap_or_else(|| PathBuf::from("data_sources/filtered_uk_pois.parquet")); + let poi_path = PathBuf::from("/volumes/syncthing/Projects/property-map/data/filtered_uk_pois.parquet"); let poi_data = if poi_path.exists() { - data::POIData::load(&poi_path) + info!("Loading POI data from {}", poi_path.display()); + let pd = data::POIData::load(&poi_path); + info!(pois = pd.lat.len(), "POI data loaded"); + pd } else { - eprintln!( - "Warning: {} not found. POI endpoints will be unavailable.", - poi_path.display() - ); + tracing::warn!("POI file not found: {}. POI endpoints will be unavailable.", poi_path.display()); data::POIData { id: Vec::new(), name: Vec::new(), @@ -55,6 +71,8 @@ async fn main() { emoji: Vec::new(), } }; + + info!("Building POI spatial grid index"); let poi_grid = index::GridIndex::build(&poi_data.lat, &poi_data.lng, 0.01); let state = Arc::new(AppState { @@ -70,7 +88,6 @@ async fn main() { .allow_methods(Any) .allow_headers(Any); - // API routes let state_features = state.clone(); let state_hexagons = state.clone(); let state_pois = state.clone(); @@ -101,7 +118,6 @@ async fn main() { }), ); - // Static file serving for frontend let frontend_dist = PathBuf::from("frontend/dist"); let app = if frontend_dist.exists() { api.fallback_service(ServeDir::new(frontend_dist)) @@ -109,10 +125,13 @@ async fn main() { api }; - let app = app.layer(cors).layer(CompressionLayer::new().gzip(true)); + let app = app + .layer(cors) + .layer(CompressionLayer::new().gzip(true)) + .layer(TraceLayer::new_for_http()); let addr = "0.0.0.0:8001"; - eprintln!("Server listening on {}", addr); + info!("Server listening on {}", addr); let listener = tokio::net::TcpListener::bind(addr).await.unwrap(); axum::serve(listener, app).await.unwrap(); diff --git a/server-rs/src/routes.rs b/server-rs/src/routes.rs deleted file mode 100644 index 509f832..0000000 --- a/server-rs/src/routes.rs +++ /dev/null @@ -1,636 +0,0 @@ -use std::fmt::Write; -use std::str::FromStr; -use std::sync::Arc; - -use axum::extract::Query; -use axum::http::StatusCode; -use axum::response::{IntoResponse, Json}; -use rustc_hash::FxHashMap; -use serde::{Deserialize, Serialize}; - -use crate::consts::{H3_PRECOMPUTE_MAX, H3_PRECOMPUTE_MIN}; -use crate::data::{Histogram, POIData, PropertyData, POI}; -use crate::index::GridIndex; - -/// Shared application state -pub struct AppState { - pub data: PropertyData, - pub grid: GridIndex, - /// h3_cells[resolution][row_idx] = precomputed H3 cell ID. - /// Empty Vec for resolutions not precomputed. - pub h3_cells: Vec>, - pub poi_data: POIData, - pub poi_grid: GridIndex, -} - -const BOUNDS_BUFFER_PERCENT: f64 = 0.2; - -// ── /api/features ── - -#[derive(Serialize)] -pub struct FeatureInfo { - name: String, - min: f64, - max: f64, - label: String, - histogram: Histogram, -} - -#[derive(Serialize)] -pub struct FeaturesResponse { - features: Vec, -} - -fn snake_to_label(name: &str) -> String { - name.split('_') - .map(|word| { - let mut chars = word.chars(); - match chars.next() { - None => String::new(), - Some(c) => { - let mut s = c.to_uppercase().to_string(); - s.extend(chars); - s - } - } - }) - .collect::>() - .join(" ") -} - -pub async fn get_features(state: Arc) -> Json { - let features = state - .data - .feature_names - .iter() - .enumerate() - .map(|(i, name): (usize, &String)| { - let stats = &state.data.feature_stats[i]; - FeatureInfo { - name: name.clone(), - min: stats.p_low, - max: stats.p_high, - label: snake_to_label(name), - histogram: stats.histogram.clone(), - } - }) - .collect(); - - Json(FeaturesResponse { features }) -} - -// ── /api/hexagons ── - -#[derive(Deserialize)] -pub struct HexagonParams { - resolution: u8, - bounds: Option, - /// Comma-separated filters: `name:min:max,...` - /// Rows must have non-NaN values within [min,max] for each filter. - filters: Option, -} - -struct ParsedFilter { - feat_idx: usize, - min: f64, - max: f64, -} - -/// Per-cell accumulator for aggregating features -struct CellAgg { - count: u32, - mins: Vec, - maxs: Vec, -} - -impl CellAgg { - fn new(num_features: usize) -> Self { - CellAgg { - count: 0, - mins: vec![f64::INFINITY; num_features], - maxs: vec![f64::NEG_INFINITY; num_features], - } - } - - /// Add a row using row-major feature_data layout. - /// feature_data[row * num_features + feat_idx] — all features for one row - /// are contiguous, so this reads a single cache line per ~8 features. - #[inline] - fn add_row(&mut self, feature_data: &[f64], row: usize, num_features: usize) { - self.count += 1; - let base = row * num_features; - let row_slice = &feature_data[base..base + num_features]; - for (i, &v) in row_slice.iter().enumerate() { - if v.is_finite() { - if v < self.mins[i] { - self.mins[i] = v; - } - if v > self.maxs[i] { - self.maxs[i] = v; - } - } - } - } -} - -/// Write the hexagons JSON response directly to a String buffer, -/// avoiding serde_json::Value allocations entirely. -fn write_hexagons_json( - buf: &mut String, - groups: &FxHashMap, - min_keys: &[String], - max_keys: &[String], - num_features: usize, -) { - buf.push_str("{\"features\":["); - let mut first = true; - for (&cell_id, agg) in groups { - if !first { - buf.push(','); - } - first = false; - - let cell = h3o::CellIndex::try_from(cell_id).unwrap(); - write!(buf, "{{\"h3\":\"{}\",\"count\":{}", cell, agg.count).unwrap(); - - for i in 0..num_features { - if agg.mins[i] != f64::INFINITY { - write!( - buf, - ",\"{}\":{},\"{}\":{}", - min_keys[i], agg.mins[i], max_keys[i], agg.maxs[i] - ) - .unwrap(); - } - } - buf.push('}'); - } - buf.push_str("]}"); -} - -pub async fn get_hexagons( - state: Arc, - Query(params): Query, -) -> Result { - let resolution = params.resolution; - if resolution < H3_PRECOMPUTE_MIN || resolution > H3_PRECOMPUTE_MAX { - return Err(( - StatusCode::BAD_REQUEST, - format!( - "resolution must be between {} and {}", - H3_PRECOMPUTE_MIN, H3_PRECOMPUTE_MAX - ), - )); - } - - let bounds_str = params.bounds.ok_or(( - StatusCode::BAD_REQUEST, - "bounds parameter is required".into(), - ))?; - - let parts: Vec = bounds_str - .split(',') - .map(|s| s.trim().parse::()) - .collect::, _>>() - .map_err(|_| { - ( - StatusCode::BAD_REQUEST, - "Invalid bounds format. Use: south,west,north,east".into(), - ) - })?; - - if parts.len() != 4 { - return Err(( - StatusCode::BAD_REQUEST, - "Invalid bounds format. Use: south,west,north,east".into(), - )); - } - - let (mut south, mut west, mut north, mut east) = (parts[0], parts[1], parts[2], parts[3]); - - // Apply bounds buffer (20%) - let lat_range = north - south; - let lng_range = east - west; - south -= lat_range * BOUNDS_BUFFER_PERCENT; - north += lat_range * BOUNDS_BUFFER_PERCENT; - west -= lng_range * BOUNDS_BUFFER_PERCENT; - east += lng_range * BOUNDS_BUFFER_PERCENT; - - // Quantize to 0.01 degree precision - let precision = 0.01; - south = (south / precision).floor() * precision; - west = (west / precision).floor() * precision; - north = (north / precision).ceil() * precision; - east = (east / precision).ceil() * precision; - - // Parse filters: `name:min:max,...` - let parsed_filters: Vec = params - .filters - .as_deref() - .filter(|s| !s.is_empty()) - .map(|s| { - s.split(',') - .filter_map(|entry| { - let parts: Vec<&str> = entry.splitn(3, ':').collect(); - if parts.len() != 3 { - return None; - } - let name = parts[0].trim(); - let min = parts[1].trim().parse::().ok()?; - let max = parts[2].trim().parse::().ok()?; - let feat_idx = state.data.feature_names.iter().position(|n| n == name)?; - Some(ParsedFilter { feat_idx, min, max }) - }) - .collect() - }) - .unwrap_or_default(); - - // Move CPU-heavy work off the async executor - let json_body = tokio::task::spawn_blocking(move || { - let t0 = std::time::Instant::now(); - - let num_features = state.data.num_features; - let feature_data = &state.data.feature_data; - - // Pre-compute JSON key strings once - let min_keys: Vec = state - .data - .feature_names - .iter() - .map(|n| format!("min_{}", n)) - .collect(); - let max_keys: Vec = state - .data - .feature_names - .iter() - .map(|n| format!("max_{}", n)) - .collect(); - - // Use precomputed H3 cells if available - let h3_cells_for_res: Option<&[u64]> = state - .h3_cells - .get(resolution as usize) - .filter(|v| !v.is_empty()) - .map(|v| v.as_slice()); - - // Aggregate using FxHashMap (fast non-crypto hash for integer keys) - // and grid visitor (no intermediate Vec allocation) - let mut groups: FxHashMap = FxHashMap::default(); - - // Row-level filter check: value must be non-NaN and within [min, max] - let row_passes = |row: usize| -> bool { - parsed_filters.iter().all(|f| { - let v = feature_data[row * num_features + f.feat_idx]; - v.is_finite() && v >= f.min && v <= f.max - }) - }; - - if let Some(precomputed) = h3_cells_for_res { - // Fast path: precomputed H3 + visitor pattern - state - .grid - .for_each_in_bounds(south, west, north, east, |row_idx| { - let row = row_idx as usize; - if !row_passes(row) { - return; - } - let cell_id = precomputed[row]; - groups - .entry(cell_id) - .or_insert_with(|| CellAgg::new(num_features)) - .add_row(feature_data, row, num_features); - }); - } else { - // Fallback: compute H3 on-the-fly - let h3_res = h3o::Resolution::try_from(resolution).unwrap(); - state - .grid - .for_each_in_bounds(south, west, north, east, |row_idx| { - let row = row_idx as usize; - if !row_passes(row) { - return; - } - let cell_id = h3o::LatLng::new(state.data.lat[row], state.data.lon[row]) - .map(|c| u64::from(c.to_cell(h3_res))) - .unwrap_or(0); - groups - .entry(cell_id) - .or_insert_with(|| CellAgg::new(num_features)) - .add_row(feature_data, row, num_features); - }); - } - - let t_agg = t0.elapsed(); - - // Write JSON directly (no serde_json::Value allocation overhead) - let mut json_buf = String::with_capacity(groups.len() * 128); - write_hexagons_json(&mut json_buf, &groups, &min_keys, &max_keys, num_features); - - let t_total = t0.elapsed(); - eprintln!( - "hexagons: res={} cells={} agg={:?} json={:?} total={:?} bytes={}", - resolution, - groups.len(), - t_agg, - t_total - t_agg, - t_total, - json_buf.len() - ); - - json_buf - }) - .await - .unwrap(); - - Ok(([("content-type", "application/json")], json_body)) -} - -// ── /api/pois ── - -#[derive(Deserialize)] -pub struct POIParams { - bounds: Option, - /// Comma-separated list of categories to filter by - categories: Option, -} - -#[derive(Serialize)] -pub struct POIsResponse { - pois: Vec, -} - -pub async fn get_pois( - state: Arc, - Query(params): Query, -) -> Result, (StatusCode, String)> { - let bounds_str = params.bounds.ok_or(( - StatusCode::BAD_REQUEST, - "bounds parameter is required".into(), - ))?; - - let parts: Vec = bounds_str - .split(',') - .map(|s| s.trim().parse::()) - .collect::, _>>() - .map_err(|_| { - ( - StatusCode::BAD_REQUEST, - "Invalid bounds format. Use: south,west,north,east".into(), - ) - })?; - - if parts.len() != 4 { - return Err(( - StatusCode::BAD_REQUEST, - "Invalid bounds format. Use: south,west,north,east".into(), - )); - } - - let (south, west, north, east) = (parts[0], parts[1], parts[2], parts[3]); - - // Parse category filter if provided - let category_filter: Option> = params - .categories - .as_deref() - .filter(|s| !s.is_empty()) - .map(|s| s.split(',').map(|c| c.trim().to_string()).collect()); - - // Move CPU-heavy work off the async executor - let result = tokio::task::spawn_blocking(move || { - // Spatial query using grid index - let row_indices = state.poi_grid.query(south, west, north, east); - - let pois: Vec = row_indices - .iter() - .filter_map(|&row_idx| { - let row = row_idx as usize; - - // Apply category filter if specified - if let Some(ref categories) = category_filter { - if !categories.contains(&state.poi_data.category[row]) { - return None; - } - } - - Some(POI { - id: state.poi_data.id[row].clone(), - name: state.poi_data.name[row].clone(), - category: state.poi_data.category[row].clone(), - lat: state.poi_data.lat[row], - lng: state.poi_data.lng[row], - emoji: state.poi_data.emoji[row].clone(), - }) - }) - .take(5000) - .collect(); - - POIsResponse { pois } - }) - .await - .unwrap(); - - Ok(Json(result)) -} - -// ── /api/poi-categories ── - -#[derive(Serialize)] -pub struct POICategoriesResponse { - categories: Vec, -} - -pub async fn get_poi_categories(state: Arc) -> Json { - // Compute unique categories - let result = tokio::task::spawn_blocking(move || { - let mut categories: Vec = state - .poi_data - .category - .iter() - .cloned() - .collect::>() - .into_iter() - .collect(); - - categories.sort(); - - POICategoriesResponse { categories } - }) - .await - .unwrap(); - - Json(result) -} - -// ── /api/hexagon-properties ── - -#[derive(Deserialize)] -pub struct HexagonPropertiesParams { - pub h3: String, - pub resolution: u8, - pub filters: Option, - pub limit: Option, - pub offset: Option, -} - -#[derive(Serialize)] -pub struct Property { - // String fields - pub address: Option, - pub postcode: Option, - pub property_type: Option, - pub built_form: Option, - pub current_energy_rating: Option, - pub potential_energy_rating: Option, - - // Numeric fields - pub lat: f64, - pub lon: f64, - - // All other numeric features stored as dynamic map - #[serde(flatten)] - pub features: FxHashMap, -} - -#[derive(Serialize)] -pub struct HexagonPropertiesResponse { - pub properties: Vec, - pub total: usize, - pub limit: usize, - pub offset: usize, - pub truncated: bool, -} - -/// Helper function to check if a row passes all filters -fn row_passes_filters( - row: usize, - filters: &[ParsedFilter], - feature_data: &[f64], - num_features: usize, -) -> bool { - filters.iter().all(|f| { - let v = feature_data[row * num_features + f.feat_idx]; - v.is_finite() && v >= f.min && v <= f.max - }) -} - -pub async fn get_hexagon_properties( - state: Arc, - Query(params): Query, -) -> Result, (StatusCode, String)> { - // 1. Parse H3 cell ID - let cell = h3o::CellIndex::from_str(¶ms.h3) - .map_err(|e| (StatusCode::BAD_REQUEST, format!("Invalid H3 cell: {}", e)))?; - let cell_u64: u64 = cell.into(); - - // 2. Validate resolution - let resolution = params.resolution as usize; - if resolution >= state.h3_cells.len() || state.h3_cells[resolution].is_empty() { - return Err(( - StatusCode::BAD_REQUEST, - "Invalid or non-precomputed resolution".to_string(), - )); - } - - // 3. Parse filters (reuse existing filter parsing logic from get_hexagons) - let parsed_filters: Vec = params - .filters - .as_deref() - .filter(|s| !s.is_empty()) - .map(|s| { - s.split(',') - .filter_map(|entry| { - let parts: Vec<&str> = entry.splitn(3, ':').collect(); - if parts.len() != 3 { - return None; - } - let name = parts[0].trim(); - let min = parts[1].trim().parse::().ok()?; - let max = parts[2].trim().parse::().ok()?; - let feat_idx = state.data.feature_names.iter().position(|n| n == name)?; - Some(ParsedFilter { feat_idx, min, max }) - }) - .collect() - }) - .unwrap_or_default(); - - // Move CPU-heavy work off the async executor - let result = tokio::task::spawn_blocking(move || { - let h3_data = &state.h3_cells[resolution]; - let num_features = state.data.num_features; - let feature_data = &state.data.feature_data; - - // 4. Find all rows with matching H3 cell - let matching_rows: Vec = h3_data - .iter() - .enumerate() - .filter_map(|(idx, &h3_cell)| { - if h3_cell == cell_u64 { - // Apply feature filters - if row_passes_filters(idx, &parsed_filters, feature_data, num_features) { - Some(idx) - } else { - None - } - } else { - None - } - }) - .collect(); - - let total = matching_rows.len(); - let limit = params.limit.unwrap_or(100).min(500); - let offset = params.offset.unwrap_or(0); - let truncated = total > offset + limit; - - // 5. Extract properties for paginated subset - let properties: Vec = matching_rows - .iter() - .skip(offset) - .take(limit) - .map(|&row| { - // Build dynamic features map - let mut features = FxHashMap::default(); - let base = row * num_features; - for (feat_idx, feat_name) in state.data.feature_names.iter().enumerate() { - let v = feature_data[base + feat_idx]; - if v.is_finite() { - features.insert(feat_name.clone(), v); - } - } - - // Helper to get non-empty string - let get_string = |s: &str| -> Option { - if s.is_empty() { - None - } else { - Some(s.to_string()) - } - }; - - Property { - address: get_string(&state.data.address[row]), - postcode: get_string(&state.data.postcode[row]), - property_type: get_string(&state.data.property_type[row]), - built_form: get_string(&state.data.built_form[row]), - current_energy_rating: get_string(&state.data.current_energy_rating[row]), - potential_energy_rating: get_string(&state.data.potential_energy_rating[row]), - lat: state.data.lat[row], - lon: state.data.lon[row], - features, - } - }) - .collect(); - - HexagonPropertiesResponse { - properties, - total, - limit, - offset, - truncated, - } - }) - .await - .unwrap(); - - Ok(Json(result)) -} diff --git a/server-rs/src/routes/features.rs b/server-rs/src/routes/features.rs new file mode 100644 index 0000000..57a74ce --- /dev/null +++ b/server-rs/src/routes/features.rs @@ -0,0 +1,87 @@ +use std::sync::Arc; + +use axum::response::Json; +use serde::Serialize; +use tracing::info; + +use crate::data::Histogram; +use crate::state::AppState; + +#[derive(Serialize)] +#[serde(tag = "type")] +pub enum FeatureInfo { + #[serde(rename = "numeric")] + Numeric { + name: String, + label: String, + min: f64, + max: f64, + histogram: Histogram, + }, + #[serde(rename = "enum")] + Enum { + name: String, + label: String, + values: Vec, + }, +} + +#[derive(Serialize)] +pub struct FeaturesResponse { + features: Vec, +} + +fn snake_to_label(name: &str) -> String { + // If name contains '/' or uppercase, assume it's already human-readable + if name.contains('/') || name.chars().any(|c| c.is_uppercase()) { + return name.to_string(); + } + name.split('_') + .map(|word| { + let mut chars = word.chars(); + match chars.next() { + None => String::new(), + Some(c) => { + let mut s = c.to_uppercase().to_string(); + s.extend(chars); + s + } + } + }) + .collect::>() + .join(" ") +} + +pub async fn get_features(state: Arc) -> Json { + let mut features: Vec = state + .data + .feature_names + .iter() + .enumerate() + .map(|(i, name): (usize, &String)| { + let stats = &state.data.feature_stats[i]; + FeatureInfo::Numeric { + name: name.clone(), + label: snake_to_label(name), + min: stats.p_low, + max: stats.p_high, + histogram: stats.histogram.clone(), + } + }) + .collect(); + + for ef in &state.data.enum_features { + features.push(FeatureInfo::Enum { + name: ef.name.clone(), + label: snake_to_label(&ef.name), + values: ef.values.clone(), + }); + } + + info!( + numeric = features.iter().filter(|f| matches!(f, FeatureInfo::Numeric { .. })).count(), + enums = features.iter().filter(|f| matches!(f, FeatureInfo::Enum { .. })).count(), + "GET /api/features" + ); + Json(FeaturesResponse { features }) +} diff --git a/server-rs/src/routes/hexagons.rs b/server-rs/src/routes/hexagons.rs new file mode 100644 index 0000000..11cb3ed --- /dev/null +++ b/server-rs/src/routes/hexagons.rs @@ -0,0 +1,257 @@ +use std::fmt::Write; +use std::sync::Arc; + +use axum::extract::Query; +use axum::http::StatusCode; +use axum::response::IntoResponse; +use rustc_hash::FxHashMap; +use serde::Deserialize; +use tracing::{info, warn}; + +use crate::consts::{H3_PRECOMPUTE_MAX, H3_PRECOMPUTE_MIN}; +use crate::filter::parse_filters; +use crate::state::AppState; + +const BOUNDS_BUFFER_PERCENT: f64 = 0.2; + +#[derive(Deserialize)] +pub struct HexagonParams { + resolution: u8, + bounds: Option, + /// Comma-separated filters: `name:min:max,...` + /// Rows must have non-NaN values within [min,max] for each filter. + filters: Option, +} + +/// Per-cell accumulator for aggregating features +struct CellAgg { + count: u32, + mins: Vec, + maxs: Vec, +} + +impl CellAgg { + fn new(num_features: usize) -> Self { + CellAgg { + count: 0, + mins: vec![f64::INFINITY; num_features], + maxs: vec![f64::NEG_INFINITY; num_features], + } + } + + /// Add a row using row-major feature_data layout. + /// feature_data[row * num_features + feat_idx] — all features for one row + /// are contiguous, so this reads a single cache line per ~8 features. + #[inline] + fn add_row(&mut self, feature_data: &[f64], row: usize, num_features: usize) { + self.count += 1; + let base = row * num_features; + let row_slice = &feature_data[base..base + num_features]; + for (i, &v) in row_slice.iter().enumerate() { + if v.is_finite() { + if v < self.mins[i] { + self.mins[i] = v; + } + if v > self.maxs[i] { + self.maxs[i] = v; + } + } + } + } +} + +/// Write the hexagons JSON response directly to a String buffer, +/// avoiding serde_json::Value allocations entirely. +fn write_hexagons_json( + buf: &mut String, + groups: &FxHashMap, + min_keys: &[String], + max_keys: &[String], + num_features: usize, +) { + buf.push_str("{\"features\":["); + let mut first = true; + for (&cell_id, agg) in groups { + if !first { + buf.push(','); + } + first = false; + + let cell = h3o::CellIndex::try_from(cell_id).unwrap(); + write!(buf, "{{\"h3\":\"{}\",\"count\":{}", cell, agg.count).unwrap(); + + for i in 0..num_features { + if agg.mins[i] != f64::INFINITY { + write!( + buf, + ",\"{}\":{},\"{}\":{}", + min_keys[i], agg.mins[i], max_keys[i], agg.maxs[i] + ) + .unwrap(); + } + } + buf.push('}'); + } + buf.push_str("]}"); +} + +pub async fn get_hexagons( + state: Arc, + Query(params): Query, +) -> Result { + let resolution = params.resolution; + if resolution < H3_PRECOMPUTE_MIN || resolution > H3_PRECOMPUTE_MAX { + warn!(resolution, "Resolution out of range [{}, {}]", H3_PRECOMPUTE_MIN, H3_PRECOMPUTE_MAX); + return Err(( + StatusCode::BAD_REQUEST, + format!( + "resolution must be between {} and {}", + H3_PRECOMPUTE_MIN, H3_PRECOMPUTE_MAX + ), + )); + } + + let bounds_str = params.bounds.ok_or(( + StatusCode::BAD_REQUEST, + "bounds parameter is required".into(), + ))?; + + let parts: Vec = bounds_str + .split(',') + .map(|s| s.trim().parse::()) + .collect::, _>>() + .map_err(|_| { + ( + StatusCode::BAD_REQUEST, + "Invalid bounds format. Use: south,west,north,east".into(), + ) + })?; + + if parts.len() != 4 { + return Err(( + StatusCode::BAD_REQUEST, + "Invalid bounds format. Use: south,west,north,east".into(), + )); + } + + let (mut south, mut west, mut north, mut east) = (parts[0], parts[1], parts[2], parts[3]); + + let lat_range = north - south; + let lng_range = east - west; + south -= lat_range * BOUNDS_BUFFER_PERCENT; + north += lat_range * BOUNDS_BUFFER_PERCENT; + west -= lng_range * BOUNDS_BUFFER_PERCENT; + east += lng_range * BOUNDS_BUFFER_PERCENT; + + let precision = 0.01; + south = (south / precision).floor() * precision; + west = (west / precision).floor() * precision; + north = (north / precision).ceil() * precision; + east = (east / precision).ceil() * precision; + + let filters_str = params.filters.clone(); + let (parsed_filters, parsed_enum_filters) = parse_filters( + params.filters.as_deref(), + &state.data.feature_names, + &state.data.enum_features, + ); + let num_filters = parsed_filters.len() + parsed_enum_filters.len(); + + let json_body = tokio::task::spawn_blocking(move || { + let t0 = std::time::Instant::now(); + + let num_features = state.data.num_features; + let feature_data = &state.data.feature_data; + + let min_keys: Vec = state + .data + .feature_names + .iter() + .map(|n| format!("min_{}", n)) + .collect(); + let max_keys: Vec = state + .data + .feature_names + .iter() + .map(|n| format!("max_{}", n)) + .collect(); + + let h3_cells_for_res: Option<&[u64]> = state + .h3_cells + .get(resolution as usize) + .filter(|v| !v.is_empty()) + .map(|v| v.as_slice()); + + let mut groups: FxHashMap = FxHashMap::default(); + + let enum_features = &state.data.enum_features; + + // Row-level filter check: numeric must be non-NaN and within [min, max], + // enum must have value index in the allowed set + let row_passes = |row: usize| -> bool { + parsed_filters.iter().all(|f| { + let v = feature_data[row * num_features + f.feat_idx]; + v.is_finite() && v >= f.min && v <= f.max + }) && parsed_enum_filters.iter().all(|ef| { + let v = enum_features[ef.enum_idx].data[row]; + v != 255 && ef.allowed.contains(&v) + }) + }; + + if let Some(precomputed) = h3_cells_for_res { + state + .grid + .for_each_in_bounds(south, west, north, east, |row_idx| { + let row = row_idx as usize; + if !row_passes(row) { + return; + } + let cell_id = precomputed[row]; + groups + .entry(cell_id) + .or_insert_with(|| CellAgg::new(num_features)) + .add_row(feature_data, row, num_features); + }); + } else { + let h3_res = h3o::Resolution::try_from(resolution).unwrap(); + state + .grid + .for_each_in_bounds(south, west, north, east, |row_idx| { + let row = row_idx as usize; + if !row_passes(row) { + return; + } + let cell_id = h3o::LatLng::new(state.data.lat[row], state.data.lon[row]) + .map(|c| u64::from(c.to_cell(h3_res))) + .unwrap_or(0); + groups + .entry(cell_id) + .or_insert_with(|| CellAgg::new(num_features)) + .add_row(feature_data, row, num_features); + }); + } + + let t_agg = t0.elapsed(); + + let mut json_buf = String::with_capacity(groups.len() * 128); + write_hexagons_json(&mut json_buf, &groups, &min_keys, &max_keys, num_features); + + let t_total = t0.elapsed(); + info!( + resolution, + cells = groups.len(), + filters = num_filters, + filters_raw = filters_str.as_deref().unwrap_or("-"), + agg_ms = format_args!("{:.1}", t_agg.as_secs_f64() * 1000.0), + total_ms = format_args!("{:.1}", t_total.as_secs_f64() * 1000.0), + bytes = json_buf.len(), + "GET /api/hexagons" + ); + + json_buf + }) + .await + .unwrap(); + + Ok(([("content-type", "application/json")], json_body)) +} diff --git a/server-rs/src/routes/mod.rs b/server-rs/src/routes/mod.rs new file mode 100644 index 0000000..d33f787 --- /dev/null +++ b/server-rs/src/routes/mod.rs @@ -0,0 +1,9 @@ +mod features; +mod hexagons; +mod pois; +mod properties; + +pub use features::get_features; +pub use hexagons::get_hexagons; +pub use pois::{get_poi_categories, get_pois}; +pub use properties::get_hexagon_properties; diff --git a/server-rs/src/routes/pois.rs b/server-rs/src/routes/pois.rs new file mode 100644 index 0000000..f309ab7 --- /dev/null +++ b/server-rs/src/routes/pois.rs @@ -0,0 +1,133 @@ +use std::sync::Arc; + +use axum::extract::Query; +use axum::http::StatusCode; +use axum::response::Json; +use serde::{Deserialize, Serialize}; +use tracing::info; + +use crate::data::POI; +use crate::state::AppState; + +#[derive(Deserialize)] +pub struct POIParams { + bounds: Option, + /// Comma-separated list of categories to filter by + categories: Option, +} + +#[derive(Serialize)] +pub struct POIsResponse { + pois: Vec, +} + +pub async fn get_pois( + state: Arc, + Query(params): Query, +) -> Result, (StatusCode, String)> { + let bounds_str = params.bounds.ok_or(( + StatusCode::BAD_REQUEST, + "bounds parameter is required".into(), + ))?; + + let parts: Vec = bounds_str + .split(',') + .map(|s| s.trim().parse::()) + .collect::, _>>() + .map_err(|_| { + ( + StatusCode::BAD_REQUEST, + "Invalid bounds format. Use: south,west,north,east".into(), + ) + })?; + + if parts.len() != 4 { + return Err(( + StatusCode::BAD_REQUEST, + "Invalid bounds format. Use: south,west,north,east".into(), + )); + } + + let (south, west, north, east) = (parts[0], parts[1], parts[2], parts[3]); + + let categories_str = params.categories.clone(); + let category_filter: Option> = params + .categories + .as_deref() + .filter(|s| !s.is_empty()) + .map(|s| s.split(',').map(|c| c.trim().to_string()).collect()); + + let num_categories = category_filter.as_ref().map(|c| c.len()).unwrap_or(0); + + let result = tokio::task::spawn_blocking(move || { + let t0 = std::time::Instant::now(); + let row_indices = state.poi_grid.query(south, west, north, east); + + let pois: Vec = row_indices + .iter() + .filter_map(|&row_idx| { + let row = row_idx as usize; + + if let Some(ref categories) = category_filter { + if !categories.contains(&state.poi_data.category[row]) { + return None; + } + } + + Some(POI { + id: state.poi_data.id[row].clone(), + name: state.poi_data.name[row].clone(), + category: state.poi_data.category[row].clone(), + lat: state.poi_data.lat[row], + lng: state.poi_data.lng[row], + emoji: state.poi_data.emoji[row].clone(), + }) + }) + .take(5000) + .collect(); + + let elapsed = t0.elapsed(); + info!( + results = pois.len(), + candidates = row_indices.len(), + categories = num_categories, + categories_raw = categories_str.as_deref().unwrap_or("-"), + ms = format_args!("{:.1}", elapsed.as_secs_f64() * 1000.0), + "GET /api/pois" + ); + + POIsResponse { pois } + }) + .await + .unwrap(); + + Ok(Json(result)) +} + +#[derive(Serialize)] +pub struct POICategoriesResponse { + categories: Vec, +} + +pub async fn get_poi_categories(state: Arc) -> Json { + let result = tokio::task::spawn_blocking(move || { + let mut categories: Vec = state + .poi_data + .category + .iter() + .cloned() + .collect::>() + .into_iter() + .collect(); + + categories.sort(); + + info!(count = categories.len(), "GET /api/poi-categories"); + + POICategoriesResponse { categories } + }) + .await + .unwrap(); + + Json(result) +} diff --git a/server-rs/src/routes/properties.rs b/server-rs/src/routes/properties.rs new file mode 100644 index 0000000..1f205cb --- /dev/null +++ b/server-rs/src/routes/properties.rs @@ -0,0 +1,198 @@ +use std::str::FromStr; +use std::sync::Arc; + +use axum::extract::Query; +use axum::http::StatusCode; +use axum::response::Json; +use rustc_hash::FxHashMap; +use serde::{Deserialize, Serialize}; +use tracing::{info, warn}; + +use crate::filter::{parse_filters, row_passes_filters}; +use crate::state::AppState; + +#[derive(Deserialize)] +pub struct HexagonPropertiesParams { + pub h3: String, + pub resolution: u8, + pub filters: Option, + pub limit: Option, + pub offset: Option, +} + +#[derive(Serialize)] +pub struct Property { + // String fields + pub address: Option, + pub postcode: Option, + pub property_type: Option, + pub built_form: Option, + pub duration: Option, + pub current_energy_rating: Option, + pub potential_energy_rating: Option, + + // Numeric fields + pub lat: f64, + pub lon: f64, + + #[serde(flatten)] + pub features: FxHashMap, +} + +#[derive(Serialize)] +pub struct HexagonPropertiesResponse { + pub properties: Vec, + pub total: usize, + pub limit: usize, + pub offset: usize, + pub truncated: bool, +} + +pub async fn get_hexagon_properties( + state: Arc, + Query(params): Query, +) -> Result, (StatusCode, String)> { + let cell = h3o::CellIndex::from_str(¶ms.h3) + .map_err(|e| { + warn!(h3 = %params.h3, error = %e, "Invalid H3 cell index"); + (StatusCode::BAD_REQUEST, format!("Invalid H3 cell: {}", e)) + })?; + let cell_u64: u64 = cell.into(); + + let resolution = params.resolution as usize; + if resolution >= state.h3_cells.len() || state.h3_cells[resolution].is_empty() { + warn!(resolution, "Invalid or non-precomputed resolution for hexagon-properties"); + return Err(( + StatusCode::BAD_REQUEST, + "Invalid or non-precomputed resolution".to_string(), + )); + } + + let h3_str = params.h3.clone(); + let filters_str = params.filters.clone(); + let (parsed_filters, parsed_enum_filters) = parse_filters( + params.filters.as_deref(), + &state.data.feature_names, + &state.data.enum_features, + ); + let num_filters = parsed_filters.len() + parsed_enum_filters.len(); + + let result = tokio::task::spawn_blocking(move || { + let t0 = std::time::Instant::now(); + let h3_data = &state.h3_cells[resolution]; + let num_features = state.data.num_features; + let feature_data = &state.data.feature_data; + let enum_features = &state.data.enum_features; + + let matching_rows: Vec = h3_data + .iter() + .enumerate() + .filter_map(|(idx, &h3_cell)| { + if h3_cell == cell_u64 { + if row_passes_filters( + idx, + &parsed_filters, + &parsed_enum_filters, + feature_data, + num_features, + enum_features, + ) { + Some(idx) + } else { + None + } + } else { + None + } + }) + .collect(); + + let total = matching_rows.len(); + let limit = params.limit.unwrap_or(100).min(500); + let offset = params.offset.unwrap_or(0); + let truncated = total > offset + limit; + + let properties: Vec = matching_rows + .iter() + .skip(offset) + .take(limit) + .map(|&row| { + let mut features = FxHashMap::default(); + let base = row * num_features; + for (feat_idx, feat_name) in state.data.feature_names.iter().enumerate() { + let v = feature_data[base + feat_idx]; + if v.is_finite() { + features.insert(feat_name.clone(), v); + } + } + + let get_string = |s: &str| -> Option { + let trimmed = s.trim(); + if trimmed.is_empty() { + None + } else { + Some(trimmed.to_string()) + } + }; + + let get_enum_value = |names: &[&str]| -> Option { + for name in names { + if let Some(val) = enum_features.iter().find_map(|ef| { + if ef.name == *name { + let idx = ef.data[row]; + if idx == 255 { + None + } else { + ef.values.get(idx as usize).cloned() + } + } else { + None + } + }) { + return Some(val); + } + } + None + }; + + Property { + address: get_string(&state.data.address[row]), + postcode: get_string(&state.data.postcode[row]), + property_type: get_enum_value(&["Property type", "epc_property_type", "pp_property_type"]), + built_form: get_enum_value(&["Property type/built form", "built_form"]), + duration: get_enum_value(&["Leashold/Freehold", "duration"]), + current_energy_rating: get_enum_value(&["Current energy rating", "current_energy_rating"]), + potential_energy_rating: get_enum_value(&["Potential energy rating", "potential_energy_rating"]), + lat: state.data.lat[row], + lon: state.data.lon[row], + features, + } + }) + .collect(); + + let elapsed = t0.elapsed(); + info!( + h3 = %h3_str, + resolution, + total, + returned = properties.len(), + offset, + filters = num_filters, + filters_raw = filters_str.as_deref().unwrap_or("-"), + ms = format_args!("{:.1}", elapsed.as_secs_f64() * 1000.0), + "GET /api/hexagon-properties" + ); + + HexagonPropertiesResponse { + properties, + total, + limit, + offset, + truncated, + } + }) + .await + .unwrap(); + + Ok(Json(result)) +} diff --git a/server-rs/src/state.rs b/server-rs/src/state.rs new file mode 100644 index 0000000..2949d9b --- /dev/null +++ b/server-rs/src/state.rs @@ -0,0 +1,12 @@ +use crate::data::{POIData, PropertyData}; +use crate::index::GridIndex; + +pub struct AppState { + pub data: PropertyData, + pub grid: GridIndex, + /// h3_cells[resolution][row_idx] = precomputed H3 cell ID. + /// Empty Vec for resolutions not precomputed. + pub h3_cells: Vec>, + pub poi_data: POIData, + pub poi_grid: GridIndex, +}