// perfect-postcode/server-rs/src/routes/hexagon_stats.rs
// 2026-05-17 10:16:30 +01:00
//
// 652 lines
// 22 KiB
// Rust

use std::collections::{HashMap, HashSet};
use std::str::FromStr;
use std::sync::Arc;
use axum::extract::{Query, State};
use axum::http::StatusCode;
use axum::response::{IntoResponse, Json};
use axum::Extension;
use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize};
use tracing::{info, warn};
use crate::auth::OptionalUser;
use crate::consts::NAN_U16;
use crate::data::travel_time::TravelData;
use crate::data::PropertyData;
use crate::features::{Feature, FEATURE_GROUPS};
use crate::licensing::{check_license_bounds, resolve_share_code};
use crate::parsing::{
cell_for_row_cached, h3_cell_bounds, needs_parent, parse_field_set, parse_filters_with_poi,
row_passes_filters, row_passes_poi_filters, validate_h3_resolution, ParsedEnumFilter,
ParsedFilter, ParsedPoiFilter,
};
use crate::state::SharedState;
use super::stats;
use super::travel_time::{
load_travel_data, parse_optional_travel, row_passes_travel_filters, TravelEntry,
};
/// Names of feature groups left out of the default area-stats field set
/// (used by `default_area_stat_field_set` when the request omits `fields`).
const AREA_STATS_EXCLUDED_GROUPS: &[&str] = &["Amenities"];
/// Upper bound on the number of entries returned by `top_filter_exclusions`.
const MAX_FILTER_EXCLUSIONS: usize = 5;
/// Histogram summary serialized inside `NumericFeatureStats::histogram`.
///
/// NOTE(review): instances are built outside this file (see the `stats`
/// module); the bucket layout is not visible here.
#[derive(Serialize)]
pub struct HistogramStats {
/// Lower bound of the histogram (presumably the smallest observed value — confirm in `stats`).
pub min: f64,
/// Upper bound of the histogram (presumably the largest observed value — confirm in `stats`).
pub max: f64,
/// 1st percentile (left edge of main distribution)
pub p1: f64,
/// 99th percentile (right edge of main distribution)
pub p99: f64,
/// Per-bucket counts; bucket edges are defined by the producer — TODO confirm.
pub counts: Vec<u64>,
}
/// Summary statistics for one numeric feature, serialized in
/// `HexagonStatsResponse::numeric_features`.
///
/// The handler in this file fills the list from `stats::compute_feature_stats`
/// and `stats::compute_poi_feature_stats`.
#[derive(Serialize)]
pub struct NumericFeatureStats {
/// Feature name.
pub name: String,
/// Number of values summarised (presumably rows with a non-missing value — confirm in `stats`).
pub count: usize,
/// Minimum of the summarised values.
pub min: f64,
/// Maximum of the summarised values.
pub max: f64,
/// Mean of the summarised values.
pub mean: f64,
/// Distribution of the values as a histogram.
pub histogram: HistogramStats,
}
/// Counts for one enum (categorical) feature, serialized in
/// `HexagonStatsResponse::enum_features`.
#[derive(Serialize)]
pub struct EnumFeatureStats {
/// Feature name.
pub name: String,
/// Count per key (presumably category label -> number of rows — confirm in `stats`).
pub counts: HashMap<String, u64>,
}
/// One point of the price history serialized in
/// `HexagonStatsResponse::price_history` (built by `stats::extract_price_history`).
#[derive(Serialize)]
pub struct PricePoint {
/// Year of the data point (stored as `f32`; whether it can be fractional is not visible here).
pub year: f32,
/// Price for that year (units/currency are defined by the producer — TODO confirm).
pub price: f32,
}
/// Describes one filter that rejected a property row, as produced by
/// `top_filter_exclusions` and returned in `HexagonStatsResponse::filter_exclusions`.
///
/// Optional fields are omitted from the serialized output when `None`.
#[derive(Serialize)]
pub struct FilterExclusion {
/// Feature / metric name; travel filters use `tt_{mode}_{slug}`.
pub name: String,
/// Filter family: `"numeric"`, `"enum"`, `"poi"` or `"travel"`.
pub kind: String,
/// How the row failed: `"lower_min"`, `"raise_max"`, `"allow_value"` or `"missing_value"`.
pub direction: String,
/// The row's decoded value (range filters only).
#[serde(skip_serializing_if = "Option::is_none")]
pub value: Option<f32>,
/// Lower bound of the filter (range filters only).
#[serde(skip_serializing_if = "Option::is_none")]
pub min: Option<f32>,
/// Upper bound of the filter (range filters only).
#[serde(skip_serializing_if = "Option::is_none")]
pub max: Option<f32>,
/// The row's category that is not in the allowed set (enum filters only).
#[serde(skip_serializing_if = "Option::is_none")]
pub category: Option<String>,
/// Distance outside the range relative to the range width; fixed at `1.0`
/// for enum mismatches and missing values.
pub relative_difference: f32,
/// Number of rows in the area rejected with the same kind/name/direction/category.
pub rejected_count: usize,
}
/// Builds the grouping key used to count how many rows hit the same exclusion.
///
/// The key is `kind`, `name`, `direction` and `category` (empty when absent)
/// joined with the ASCII unit separator (`U+001F`).
fn filter_exclusion_key(exclusion: &FilterExclusion) -> String {
    let category = exclusion.category.as_ref().map_or("", String::as_str);
    [
        exclusion.kind.as_str(),
        exclusion.name.as_str(),
        exclusion.direction.as_str(),
        category,
    ]
    .join("\u{1f}")
}
/// Creates the exclusion recorded when a row has no value for a filtered field.
///
/// The result has direction `"missing_value"`, a relative difference of `1.0`,
/// no value/bounds/category, and a zero `rejected_count`.
fn missing_filter_exclusion(name: String, kind: &str) -> FilterExclusion {
    let kind = String::from(kind);
    let direction = String::from("missing_value");
    FilterExclusion {
        name,
        kind,
        direction,
        value: None,
        min: None,
        max: None,
        category: None,
        relative_difference: 1.0,
        rejected_count: 0,
    }
}
/// JSON body returned by `get_hexagon_stats`.
///
/// Empty `price_history` / `filter_exclusions` and a `None` `central_postcode`
/// are omitted from the serialized output.
#[derive(Serialize)]
pub struct HexagonStatsResponse {
/// Number of rows in the hexagon that pass every filter.
pub count: usize,
/// Statistics for numeric features (property features followed by POI metrics).
pub numeric_features: Vec<NumericFeatureStats>,
/// Statistics for enum features.
pub enum_features: Vec<EnumFeatureStats>,
/// Price history derived from the matching rows.
#[serde(skip_serializing_if = "Vec::is_empty")]
pub price_history: Vec<PricePoint>,
/// Representative postcode of the matching rows; `None` when nothing matches.
#[serde(skip_serializing_if = "Option::is_none")]
pub central_postcode: Option<String>,
/// Filters that reject the hexagon's rows; only computed when `count` is 0.
#[serde(skip_serializing_if = "Vec::is_empty")]
pub filter_exclusions: Vec<FilterExclusion>,
}
/// Query-string parameters accepted by `get_hexagon_stats`.
#[derive(Deserialize)]
pub struct HexagonStatsParams {
/// H3 cell index as a string (parsed with `h3o::CellIndex::from_str`).
pub h3: String,
/// H3 resolution at which rows are assigned to cells (checked by `validate_h3_resolution`).
pub resolution: u8,
/// Raw filter expression, parsed by `parse_filters_with_poi`; absent means no filters.
pub filters: Option<String>,
/// `;;`-separated feature names to include in stats response.
/// Only listed features are computed. If absent, area stats default to
/// displayable groups; if empty, no feature stats are returned.
pub fields: Option<String>,
/// When set (with journey_slug), pick central_postcode as the postcode with the
/// shortest travel time for this mode+slug (so it has journey data).
pub journey_mode: Option<String>,
/// Destination slug paired with `journey_mode`; both must be present to take effect.
pub journey_slug: Option<String>,
/// Pipe-separated travel time entries: `mode:slug|mode:slug:min:max`.
/// Optional min:max applies as a filter (exclude properties outside range).
pub travel: Option<String>,
/// Share-link code; grants bbox-scoped access for unlicensed users.
pub share: Option<String>,
}
/// Collects the names of every feature in `FEATURE_GROUPS`, skipping the groups
/// listed in `AREA_STATS_EXCLUDED_GROUPS`.
fn default_area_stat_field_set() -> HashSet<String> {
    let mut names = HashSet::new();
    for group in FEATURE_GROUPS.iter() {
        if AREA_STATS_EXCLUDED_GROUPS.contains(&group.name) {
            continue;
        }
        for feature in group.features.iter() {
            let name = match feature {
                Feature::Numeric(config) => config.name.to_string(),
                Feature::Enum(config) => config.name.to_string(),
            };
            names.insert(name);
        }
    }
    names
}
/// Resolves the `fields` query parameter for area statistics.
///
/// When `parse_field_set` reports that fields were specified, its result is
/// returned unchanged. Otherwise the default field set is used, and it is
/// reported as specified so that only those defaults are computed.
pub(super) fn parse_area_stats_field_set(fields: Option<&str>) -> (bool, HashSet<String>) {
    match parse_field_set(fields) {
        (true, requested) => (true, requested),
        (false, _) => (true, default_area_stat_field_set()),
    }
}
/// Measures how far `value` lies outside the inclusive range `[min, max]`.
///
/// Returns `None` when `value` is inside the range (or compares as neither
/// below `min` nor above `max`, e.g. NaN). Otherwise returns the direction the
/// range would have to move (`"lower_min"` or `"raise_max"`) together with the
/// overshoot divided by the range width. When the width is not finite or is at
/// most `f32::EPSILON`, the overshoot is divided by the largest of `|min|`,
/// `|max|` and `1.0` instead.
#[inline]
fn relative_difference(value: f32, min: f32, max: f32) -> Option<(String, f32)> {
    let (direction, overshoot) = if value < min {
        ("lower_min", min - value)
    } else if value > max {
        ("raise_max", value - max)
    } else {
        return None;
    };

    let width = (max - min).abs();
    let scale = if width.is_finite() && width > f32::EPSILON {
        width
    } else {
        // Degenerate range: fall back to the magnitude of the bounds.
        min.abs().max(max.abs()).max(1.0)
    };

    Some((direction.to_string(), overshoot / scale))
}
/// Builds the exclusion for a range filter (`kind`) whose `value` falls outside
/// `[min, max]`; returns `None` when the value is within range.
///
/// `name` is only invoked when an exclusion is actually produced, so callers
/// do not pay for a name allocation on rows that pass the filter.
fn out_of_range_exclusion(
    kind: &str,
    value: f32,
    min: f32,
    max: f32,
    name: impl FnOnce() -> String,
) -> Option<FilterExclusion> {
    let (direction, rel_diff) = relative_difference(value, min, max)?;
    Some(FilterExclusion {
        name: name(),
        kind: kind.to_string(),
        direction,
        value: Some(value),
        min: Some(min),
        max: Some(max),
        category: None,
        relative_difference: rel_diff,
        rejected_count: 0,
    })
}

/// Explains why no row of an area passes the active filters.
///
/// For every row in `area_rows` the function collects the filters that reject
/// it (its "path"): numeric, enum and POI filters, plus travel entries that
/// have both `filter_min` and `filter_max`. A missing value counts as a
/// rejection with relative difference `1.0`. The row whose path has the
/// smallest summed relative difference wins; ties go to the shorter path, and
/// otherwise to the row seen first.
///
/// Each exclusion of the winning path gets `rejected_count` set to the number
/// of rows that were rejected with the same kind/name/direction/category. The
/// result is sorted by ascending relative difference, then descending
/// `rejected_count`, then name, and truncated to `MAX_FILTER_EXCLUSIONS`.
///
/// Returns an empty vector when `area_rows` is empty, when no filter is active,
/// or when no row produced any exclusion.
///
/// `travel_data` is indexed in parallel with `travel_entries`.
pub(super) fn top_filter_exclusions(
    area_rows: &[usize],
    numeric_filters: &[ParsedFilter],
    enum_filters: &[ParsedEnumFilter],
    poi_filters: &[ParsedPoiFilter],
    travel_entries: &[TravelEntry],
    travel_data: &[TravelData],
    data: &PropertyData,
) -> Vec<FilterExclusion> {
    if area_rows.is_empty() {
        return Vec::new();
    }
    let has_travel_filter = travel_entries
        .iter()
        .any(|entry| entry.filter_min.is_some() && entry.filter_max.is_some());
    if numeric_filters.is_empty()
        && enum_filters.is_empty()
        && poi_filters.is_empty()
        && !has_travel_filter
    {
        return Vec::new();
    }

    let feature_data = &data.feature_data;
    let num_features = data.num_features;
    let quant = data.quant_ref();
    let poi_quant = data.poi_metrics.quant_ref();

    // Filter bounds do not depend on the row, so decode them once up front.
    let numeric_bounds: Vec<(f32, f32)> = numeric_filters
        .iter()
        .map(|filter| {
            (
                quant.decode(filter.feat_idx, filter.min_u16),
                quant.decode(filter.feat_idx, filter.max_u16),
            )
        })
        .collect();
    let poi_bounds: Vec<(f32, f32)> = poi_filters
        .iter()
        .map(|filter| {
            (
                poi_quant.decode(filter.metric_idx, filter.min_u16),
                poi_quant.decode(filter.metric_idx, filter.max_u16),
            )
        })
        .collect();

    let mut rejection_counts: HashMap<String, usize> = HashMap::new();
    let mut best_path: Option<Vec<FilterExclusion>> = None;
    // Score of `best_path`; stays at infinity until a first path is accepted.
    let mut best_score = f32::INFINITY;

    for &row in area_rows {
        let mut path = Vec::new();

        for (filter, &(min, max)) in numeric_filters.iter().zip(&numeric_bounds) {
            let raw = feature_data[row * num_features + filter.feat_idx];
            if raw == NAN_U16 {
                path.push(missing_filter_exclusion(
                    data.feature_names[filter.feat_idx].clone(),
                    "numeric",
                ));
                continue;
            }
            let value = quant.decode(filter.feat_idx, raw);
            if let Some(exclusion) = out_of_range_exclusion("numeric", value, min, max, || {
                data.feature_names[filter.feat_idx].clone()
            }) {
                path.push(exclusion);
            }
        }

        for filter in enum_filters {
            let raw = feature_data[row * num_features + filter.feat_idx];
            if raw == NAN_U16 {
                path.push(missing_filter_exclusion(
                    data.feature_names[filter.feat_idx].clone(),
                    "enum",
                ));
                continue;
            }
            if filter.allowed.contains(&raw) {
                continue;
            }
            // Rows whose raw code has no known label are skipped silently.
            let Some(values) = data.enum_values.get(&filter.feat_idx) else {
                continue;
            };
            let Some(category) = values.get(raw as usize) else {
                continue;
            };
            path.push(FilterExclusion {
                name: data.feature_names[filter.feat_idx].clone(),
                kind: "enum".to_string(),
                direction: "allow_value".to_string(),
                value: None,
                min: None,
                max: None,
                category: Some(category.clone()),
                relative_difference: 1.0,
                rejected_count: 0,
            });
        }

        for (filter, &(min, max)) in poi_filters.iter().zip(&poi_bounds) {
            let raw = data
                .poi_metrics
                .raw_for_property_row(row, filter.metric_idx);
            if raw == NAN_U16 {
                path.push(missing_filter_exclusion(
                    data.poi_metrics.feature_names[filter.metric_idx].clone(),
                    "poi",
                ));
                continue;
            }
            let value = poi_quant.decode(filter.metric_idx, raw);
            if let Some(exclusion) = out_of_range_exclusion("poi", value, min, max, || {
                data.poi_metrics.feature_names[filter.metric_idx].clone()
            }) {
                path.push(exclusion);
            }
        }

        for (filter_index, entry) in travel_entries.iter().enumerate() {
            // Entries without both bounds are not filters.
            let (Some(min), Some(max)) = (entry.filter_min, entry.filter_max) else {
                continue;
            };
            let travel_name = || format!("tt_{}_{}", entry.mode, entry.slug);
            let postcode = data.postcode(row);
            let Some(row_data) = travel_data
                .get(filter_index)
                .and_then(|travel| travel.get(postcode))
            else {
                path.push(missing_filter_exclusion(travel_name(), "travel"));
                continue;
            };
            let minutes = if entry.use_best {
                row_data.best_minutes.unwrap_or(row_data.minutes)
            } else {
                row_data.minutes
            } as f32;
            if let Some(exclusion) =
                out_of_range_exclusion("travel", minutes, min, max, travel_name)
            {
                path.push(exclusion);
            }
        }

        if path.is_empty() {
            continue;
        }

        for exclusion in &path {
            *rejection_counts
                .entry(filter_exclusion_key(exclusion))
                .or_default() += 1;
        }

        let path_score = path
            .iter()
            .map(|exclusion| exclusion.relative_difference)
            .sum::<f32>();
        let replace = path_score < best_score
            || (path_score == best_score
                && best_path
                    .as_ref()
                    .map_or(true, |current| path.len() < current.len()));
        if replace {
            best_score = path_score;
            best_path = Some(path);
        }
    }

    let Some(mut exclusions) = best_path else {
        return Vec::new();
    };
    for exclusion in &mut exclusions {
        exclusion.rejected_count = rejection_counts
            .get(&filter_exclusion_key(exclusion))
            .copied()
            .unwrap_or(0);
    }
    exclusions.sort_by(|a, b| {
        a.relative_difference
            .partial_cmp(&b.relative_difference)
            .unwrap_or(std::cmp::Ordering::Equal)
            .then_with(|| b.rejected_count.cmp(&a.rejected_count))
            .then_with(|| a.name.cmp(&b.name))
    });
    exclusions.truncate(MAX_FILTER_EXCLUSIONS);
    exclusions
}
/// Handler for `GET /api/hexagon-stats`: statistics for the property rows that
/// fall inside one H3 cell and pass the requested filters.
///
/// Steps:
/// 1. Parse and validate the cell and resolution, then run the license check
///    against the cell bounds (optionally widened by a share code).
/// 2. Parse feature/POI filters, the field set and the travel entries.
/// 3. On a blocking worker: collect the rows of the cell (`area_rows`) and the
///    subset passing every filter (`matching_rows`), then compute the central
///    postcode, price history and feature statistics from the matching rows.
///    When nothing matches, `top_filter_exclusions` explains which filters
///    reject the area's rows.
///
/// # Errors
///
/// * `400 Bad Request` for an unparsable H3 cell, filter string or travel string.
/// * Whatever response `validate_h3_resolution` / `check_license_bounds` produce
///   on failure.
/// * `500 Internal Server Error` when the blocking task fails to join or the
///   computation returns an error (invalid resolution conversion, travel data load).
pub async fn get_hexagon_stats(
    State(shared): State<Arc<SharedState>>,
    Extension(user): Extension<OptionalUser>,
    Query(params): Query<HexagonStatsParams>,
) -> Result<Json<HexagonStatsResponse>, axum::response::Response> {
    let state = shared.load_state();
    let cell = h3o::CellIndex::from_str(&params.h3).map_err(|error| {
        warn!(h3 = %params.h3, error = %error, "Invalid H3 cell index");
        (
            StatusCode::BAD_REQUEST,
            format!("Invalid H3 cell: {}", error),
        )
            .into_response()
    })?;
    let cell_u64: u64 = cell.into();
    let resolution = params.resolution;
    validate_h3_resolution(resolution).map_err(IntoResponse::into_response)?;

    // License check using H3 cell bounds
    let h3_bounds = h3_cell_bounds(cell, 0.0);
    let share_bounds = resolve_share_code(&state, params.share.as_deref()).await;
    check_license_bounds(&user.0, h3_bounds, share_bounds)?;
    let h3_str = params.h3;

    let quant = state.data.quant_ref();
    let poi_quant = state.data.poi_metrics.quant_ref();
    let (parsed_filters, parsed_enum_filters, parsed_poi_filters) = parse_filters_with_poi(
        params.filters.as_deref(),
        &state.feature_name_to_index,
        &state.data.enum_values,
        &quant,
        &state.data.poi_metrics.name_to_index,
        &poi_quant,
    )
    .map_err(|err| (StatusCode::BAD_REQUEST, err).into_response())?;
    let num_filters = parsed_filters.len() + parsed_enum_filters.len() + parsed_poi_filters.len();
    let filters_str = params.filters;
    let has_poi_filters = !parsed_poi_filters.is_empty();
    let (fields_specified, field_set) = parse_area_stats_field_set(params.fields.as_deref());
    let travel_entries = parse_optional_travel(params.travel.as_deref())
        .map_err(|err| (StatusCode::BAD_REQUEST, err).into_response())?;

    // Load travel time data for central_postcode selection (if requested).
    // A failed load is treated the same as "no journey requested".
    // NOTE(review): this lookup runs on the async executor, while the other
    // travel loads happen inside `spawn_blocking` — confirm it does not block.
    let journey_travel_data = match (&params.journey_mode, &params.journey_slug) {
        (Some(mode), Some(slug)) if state.travel_time_store.has_destination(mode, slug) => {
            state.travel_time_store.get(mode, slug).ok()
        }
        _ => None,
    };

    let response = tokio::task::spawn_blocking(move || {
        let start_time = std::time::Instant::now();
        let precomputed = &state.h3_cells;
        let h3_res = h3o::Resolution::try_from(resolution)
            .map_err(|err| format!("Invalid H3 resolution {}: {}", resolution, err))?;
        let need_parent = needs_parent(resolution);
        let num_features = state.data.num_features;
        let feature_data = &state.data.feature_data;
        let travel_data = load_travel_data(&state.travel_time_store, &travel_entries)?;
        let has_travel = !travel_entries.is_empty();

        // The grid is queried with a slightly padded bounding box; the exact
        // cell membership test below discards rows outside the hexagon.
        let (min_lat, min_lon, max_lat, max_lon) = h3_cell_bounds(cell, 0.001);
        let mut h3_cache: FxHashMap<u64, u64> = FxHashMap::default();
        let mut area_rows: Vec<usize> = Vec::new();
        let mut matching_rows: Vec<usize> = Vec::new();
        state
            .grid
            .for_each_in_bounds(min_lat, min_lon, max_lat, max_lon, |row_idx| {
                let row = row_idx as usize;
                if cell_for_row_cached(row, precomputed, h3_res, need_parent, &mut h3_cache)
                    != cell_u64
                {
                    return;
                }
                area_rows.push(row);
                if row_passes_filters(
                    row,
                    &parsed_filters,
                    &parsed_enum_filters,
                    feature_data,
                    num_features,
                ) && (!has_poi_filters
                    || row_passes_poi_filters(row, &parsed_poi_filters, &state.data.poi_metrics))
                {
                    if has_travel
                        && !row_passes_travel_filters(
                            state.data.postcode(row),
                            &travel_entries,
                            &travel_data,
                        )
                    {
                        return;
                    }
                    matching_rows.push(row);
                }
            });

        let total_count = matching_rows.len();
        let filter_exclusions = if total_count == 0 {
            top_filter_exclusions(
                &area_rows,
                &parsed_filters,
                &parsed_enum_filters,
                &parsed_poi_filters,
                &travel_entries,
                &travel_data,
                &state.data,
            )
        } else {
            Vec::new()
        };

        // Pick central_postcode: prefer the postcode with the shortest travel time
        // for the requested journey destination (so it has journey data). Fall back
        // to geographic proximity to the hexagon center. With no matching rows both
        // searches come back empty and the result is `None`.
        let central_postcode = journey_travel_data
            .as_ref()
            .and_then(|journey| {
                matching_rows
                    .iter()
                    .copied()
                    .filter_map(|row| {
                        journey
                            .get(state.data.postcode(row))
                            .map(|td| (row, td.minutes))
                    })
                    .min_by_key(|&(_, mins)| mins)
                    .map(|(row, _)| row)
            })
            .or_else(|| {
                // No journey requested, or no matching row has travel data.
                let center: h3o::LatLng = cell.into();
                let center_lat = center.lat() as f32;
                let center_lon = center.lng() as f32;
                let squared_distance = |row: usize| {
                    (state.data.lat[row] - center_lat).powi(2)
                        + (state.data.lon[row] - center_lon).powi(2)
                };
                matching_rows.iter().copied().min_by(|&a, &b| {
                    squared_distance(a)
                        .partial_cmp(&squared_distance(b))
                        .unwrap_or(std::cmp::Ordering::Equal)
                })
            })
            .map(|row| state.data.postcode(row).to_string());

        let price_history =
            stats::extract_price_history(&matching_rows, &state.data, &state.feature_name_to_index);
        let (mut numeric_features, enum_features_out) = stats::compute_feature_stats(
            &matching_rows,
            &state.data,
            &state.data.feature_names,
            &state.data.enum_values,
            &state.data.feature_stats,
            fields_specified,
            &field_set,
        );
        numeric_features.extend(stats::compute_poi_feature_stats(
            &matching_rows,
            &state.data.poi_metrics,
            fields_specified,
            &field_set,
        ));

        let elapsed = start_time.elapsed();
        info!(
            h3 = %h3_str,
            resolution,
            total_count,
            filters = num_filters,
            filters_raw = filters_str.as_deref().unwrap_or("-"),
            travel_entries = travel_entries.len(),
            ms = format_args!("{:.1}", elapsed.as_secs_f64() * 1000.0),
            "GET /api/hexagon-stats"
        );
        Ok(HexagonStatsResponse {
            count: total_count,
            numeric_features,
            enum_features: enum_features_out,
            price_history,
            central_postcode,
            filter_exclusions,
        })
    })
    .await
    .map_err(|error| (StatusCode::INTERNAL_SERVER_ERROR, error.to_string()).into_response())?
    .map_err(|error: String| (StatusCode::INTERNAL_SERVER_ERROR, error).into_response())?;
    Ok(Json(response))
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Without a `fields` parameter the default set is used and reported as
    /// specified; the names listed as excluded below must be absent from it.
    #[test]
    fn default_area_stat_fields_skip_amenities() {
        let (specified, defaults) = parse_area_stats_field_set(None);
        assert!(specified);

        for expected in [
            "Property type",
            "Street tree density percentile",
            "Noise (dB)",
        ] {
            assert!(defaults.contains(expected));
        }
        for excluded in [
            "Max available download speed (Mbps)",
            "Distance to nearest amenity (Cafe) (km)",
        ] {
            assert!(!defaults.contains(excluded));
        }
    }

    /// An explicit `fields` list is returned as-is, without adding defaults.
    #[test]
    fn explicit_area_stat_fields_are_respected() {
        let (specified, requested) =
            parse_area_stats_field_set(Some("Noise (dB);;Property type"));
        assert!(specified);
        assert_eq!(requested.len(), 2);
        for expected in ["Noise (dB)", "Property type"] {
            assert!(requested.contains(expected));
        }
    }
}