Checkpoint all changes

This commit is contained in:
Andras Schmelczer 2026-02-01 19:30:33 +00:00
parent 65877acf95
commit 66c2a25457
28 changed files with 3035 additions and 621 deletions

View file

@ -1,4 +1,4 @@
use std::fmt::Write;
use std::fmt::{self, Write};
use std::sync::Arc;
use axum::extract::Query;
@ -8,11 +8,29 @@ use rustc_hash::FxHashMap;
use serde::Deserialize;
use tracing::{info, warn};
use crate::consts::{H3_PRECOMPUTE_MAX, H3_PRECOMPUTE_MIN};
use crate::consts::{
BOUNDS_BUFFER_PERCENT, BOUNDS_QUANTIZATION, ENUM_NULL, H3_PRECOMPUTE_MAX, H3_PRECOMPUTE_MIN,
POSTCODE_MIN_RESOLUTION,
};
use crate::filter::parse_filters;
use crate::state::AppState;
const BOUNDS_BUFFER_PERCENT: f64 = 0.2;
use super::parse::parse_bounds;
/// A byte count that displays as a short, human-readable size.
///
/// Uses decimal (SI) units: values of at least 1,000,000 are shown in `MB`,
/// values of at least 1,000 in `KB` (both with one decimal place), and
/// anything smaller as a whole number of `B`.
struct HumanBytes(usize);

impl fmt::Display for HumanBytes {
    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
        const KILO: usize = 1_000;
        const MEGA: usize = 1_000_000;

        // Pick the largest unit whose threshold the value reaches.
        match self.0 {
            size if size >= MEGA => write!(formatter, "{:.1} MB", size as f64 / 1_000_000.0),
            size if size >= KILO => write!(formatter, "{:.1} KB", size as f64 / 1_000.0),
            size => write!(formatter, "{} B", size),
        }
    }
}
#[derive(Deserialize)]
pub struct HexagonParams {
@ -28,14 +46,28 @@ struct CellAgg {
count: u32,
mins: Vec<f64>,
maxs: Vec<f64>,
/// Min/max ordinal indices for enum features (`enum_mins` starts at `ENUM_NULL` = no data yet; `enum_maxs` starts at 0)
enum_mins: Vec<u8>,
enum_maxs: Vec<u8>,
/// First non-empty postcode seen in this cell (only tracked at high resolutions);
/// `postcode_count` is the number of rows that carried that same postcode.
postcode: Option<String>,
postcode_count: u32,
lat_sum: f64,
lon_sum: f64,
}
impl CellAgg {
fn new(num_features: usize) -> Self {
fn new(num_features: usize, num_enums: usize) -> Self {
CellAgg {
count: 0,
mins: vec![f64::INFINITY; num_features],
maxs: vec![f64::NEG_INFINITY; num_features],
enum_mins: vec![ENUM_NULL; num_enums],
enum_maxs: vec![0; num_enums],
postcode: None,
postcode_count: 0,
lat_sum: 0.0,
lon_sum: 0.0,
}
}
@ -47,49 +79,129 @@ impl CellAgg {
self.count += 1;
let base = row * num_features;
let row_slice = &feature_data[base..base + num_features];
for (i, &v) in row_slice.iter().enumerate() {
if v.is_finite() {
if v < self.mins[i] {
self.mins[i] = v;
for (feat_index, &value) in row_slice.iter().enumerate() {
if value.is_finite() {
if value < self.mins[feat_index] {
self.mins[feat_index] = value;
}
if v > self.maxs[i] {
self.maxs[i] = v;
if value > self.maxs[feat_index] {
self.maxs[feat_index] = value;
}
}
}
}
/// Fold one row's enum values into this cell's per-feature ordinal min/max.
///
/// For every entry of `enum_features`, reads the value stored at `row`.
/// A value equal to `ENUM_NULL` is treated as missing and leaves that
/// feature's min/max untouched. Otherwise the value replaces the stored
/// minimum if that is still `ENUM_NULL` or is larger, and replaces the
/// stored maximum if that is smaller.
///
/// Indexing is unchecked by this method: `row` and the number of enum
/// features must be in range for the underlying containers.
#[inline]
fn add_enums(&mut self, enum_features: &[crate::data::EnumFeatureData], row: usize) {
    for (slot, feature) in enum_features.iter().enumerate() {
        let ordinal = feature.data[row];
        if ordinal == ENUM_NULL {
            // Missing value for this feature: nothing to record.
            continue;
        }

        let lowest = &mut self.enum_mins[slot];
        if *lowest == ENUM_NULL || ordinal < *lowest {
            *lowest = ordinal;
        }

        let highest = &mut self.enum_maxs[slot];
        if ordinal > *highest {
            *highest = ordinal;
        }
    }
}
/// Accumulate a row's coordinates and postcode into this cell.
///
/// `lat` and `lon` are always added to the running sums, even when
/// `postcode` is empty. The postcode itself follows a "first seen" rule:
/// the first non-empty postcode is stored, later rows that carry the same
/// postcode bump `postcode_count`, and rows with a different postcode are
/// ignored.
#[inline]
fn add_postcode(&mut self, postcode: &str, lat: f64, lon: f64) {
    self.lat_sum += lat;
    self.lon_sum += lon;

    if postcode.is_empty() {
        return;
    }

    match self.postcode.as_deref() {
        // Nothing stored yet: this postcode becomes the cell's postcode.
        None => {
            self.postcode = Some(postcode.to_owned());
            self.postcode_count = 1;
        }
        Some(existing) if existing == postcode => self.postcode_count += 1,
        // A different postcode than the stored one: not tracked.
        Some(_) => {}
    }
}
}
/// Append `text` to `buf`, escaped so it can sit inside a JSON string literal.
///
/// Double quotes, backslashes, newline, carriage return and tab get their
/// two-character escapes; any other character below U+0020 is written as a
/// `\u00XX` escape; everything else is copied through unchanged. The
/// surrounding quotes are not written.
pub(crate) fn write_json_escaped(buf: &mut String, text: &str) {
    for character in text.chars() {
        // Characters that have a dedicated short escape sequence.
        let short_escape = match character {
            '"' => Some("\\\""),
            '\\' => Some("\\\\"),
            '\n' => Some("\\n"),
            '\r' => Some("\\r"),
            '\t' => Some("\\t"),
            _ => None,
        };

        if let Some(sequence) = short_escape {
            buf.push_str(sequence);
        } else if character < '\x20' {
            // Remaining control characters use the generic \uXXXX form.
            // Writing into a String cannot fail, so the result is ignored.
            let _ = write!(buf, "\\u{:04x}", character as u32);
        } else {
            buf.push(character);
        }
    }
}
/// Write the hexagons JSON response directly to a String buffer,
/// avoiding serde_json::Value allocations entirely.
#[allow(clippy::too_many_arguments)]
fn write_hexagons_json(
buf: &mut String,
groups: &FxHashMap<u64, CellAgg>,
min_keys: &[String],
max_keys: &[String],
num_features: usize,
enum_min_keys: &[String],
enum_max_keys: &[String],
num_enums: usize,
include_postcode: bool,
) {
buf.push_str("{\"features\":[");
let mut first = true;
for (&cell_id, agg) in groups {
for (&cell_id, aggregation) in groups {
let Some(cell) = h3o::CellIndex::try_from(cell_id).ok() else {
continue;
};
if !first {
buf.push(',');
}
first = false;
let cell = h3o::CellIndex::try_from(cell_id).unwrap();
write!(buf, "{{\"h3\":\"{}\",\"count\":{}", cell, agg.count).unwrap();
let _ = write!(buf, "{{\"h3\":\"{}\",\"count\":{}", cell, aggregation.count);
for i in 0..num_features {
if agg.mins[i] != f64::INFINITY {
write!(
for feat_index in 0..num_features {
if aggregation.mins[feat_index].is_finite() && aggregation.maxs[feat_index].is_finite() {
let _ = write!(
buf,
",\"{}\":{},\"{}\":{}",
min_keys[i], agg.mins[i], max_keys[i], agg.maxs[i]
)
.unwrap();
min_keys[feat_index], aggregation.mins[feat_index], max_keys[feat_index], aggregation.maxs[feat_index]
);
}
}
for enum_index in 0..num_enums {
if aggregation.enum_mins[enum_index] != ENUM_NULL {
let _ = write!(
buf,
",\"{}\":{},\"{}\":{}",
enum_min_keys[enum_index], aggregation.enum_mins[enum_index],
enum_max_keys[enum_index], aggregation.enum_maxs[enum_index]
);
}
}
if include_postcode {
if let Some(ref postcode) = aggregation.postcode {
let total = aggregation.count as f64;
let centroid_lat = aggregation.lat_sum / total;
let centroid_lon = aggregation.lon_sum / total;
if centroid_lat.is_finite() && centroid_lon.is_finite() {
buf.push_str(",\"postcode\":\"");
write_json_escaped(buf, postcode);
let _ = write!(buf, "\",\"lat\":{},\"lon\":{}", centroid_lat, centroid_lon);
}
}
}
buf.push('}');
}
buf.push_str("]}");
@ -101,7 +213,10 @@ pub async fn get_hexagons(
) -> Result<impl IntoResponse, (StatusCode, String)> {
let resolution = params.resolution;
if resolution < H3_PRECOMPUTE_MIN || resolution > H3_PRECOMPUTE_MAX {
warn!(resolution, "Resolution out of range [{}, {}]", H3_PRECOMPUTE_MIN, H3_PRECOMPUTE_MAX);
warn!(
resolution,
"Resolution out of range [{}, {}]", H3_PRECOMPUTE_MIN, H3_PRECOMPUTE_MAX
);
return Err((
StatusCode::BAD_REQUEST,
format!(
@ -116,25 +231,7 @@ pub async fn get_hexagons(
"bounds parameter is required".into(),
))?;
let parts: Vec<f64> = bounds_str
.split(',')
.map(|s| s.trim().parse::<f64>())
.collect::<Result<Vec<_>, _>>()
.map_err(|_| {
(
StatusCode::BAD_REQUEST,
"Invalid bounds format. Use: south,west,north,east".into(),
)
})?;
if parts.len() != 4 {
return Err((
StatusCode::BAD_REQUEST,
"Invalid bounds format. Use: south,west,north,east".into(),
));
}
let (mut south, mut west, mut north, mut east) = (parts[0], parts[1], parts[2], parts[3]);
let (mut south, mut west, mut north, mut east) = parse_bounds(&bounds_str)?;
let lat_range = north - south;
let lng_range = east - west;
@ -143,11 +240,10 @@ pub async fn get_hexagons(
west -= lng_range * BOUNDS_BUFFER_PERCENT;
east += lng_range * BOUNDS_BUFFER_PERCENT;
let precision = 0.01;
south = (south / precision).floor() * precision;
west = (west / precision).floor() * precision;
north = (north / precision).ceil() * precision;
east = (east / precision).ceil() * precision;
south = (south / BOUNDS_QUANTIZATION).floor() * BOUNDS_QUANTIZATION;
west = (west / BOUNDS_QUANTIZATION).floor() * BOUNDS_QUANTIZATION;
north = (north / BOUNDS_QUANTIZATION).ceil() * BOUNDS_QUANTIZATION;
east = (east / BOUNDS_QUANTIZATION).ceil() * BOUNDS_QUANTIZATION;
let filters_str = params.filters.clone();
let (parsed_filters, parsed_enum_filters) = parse_filters(
@ -157,44 +253,38 @@ pub async fn get_hexagons(
);
let num_filters = parsed_filters.len() + parsed_enum_filters.len();
let json_body = tokio::task::spawn_blocking(move || {
let json_body = tokio::task::spawn_blocking(move || -> Result<String, String> {
let t0 = std::time::Instant::now();
let num_features = state.data.num_features;
let num_enums = state.data.enum_features.len();
let feature_data = &state.data.feature_data;
let min_keys: Vec<String> = state
.data
.feature_names
.iter()
.map(|n| format!("min_{}", n))
.collect();
let max_keys: Vec<String> = state
.data
.feature_names
.iter()
.map(|n| format!("max_{}", n))
.collect();
let min_keys = &state.min_keys;
let max_keys = &state.max_keys;
let enum_min_keys = &state.enum_min_keys;
let enum_max_keys = &state.enum_max_keys;
let h3_cells_for_res: Option<&[u64]> = state
.h3_cells
.get(resolution as usize)
.filter(|v| !v.is_empty())
.map(|v| v.as_slice());
.filter(|cells| !cells.is_empty())
.map(|cells| cells.as_slice());
let mut groups: FxHashMap<u64, CellAgg> = FxHashMap::default();
let enum_features = &state.data.enum_features;
let include_postcode = resolution >= POSTCODE_MIN_RESOLUTION;
// Row-level filter check: numeric must be non-NaN and within [min, max],
// enum must have value index in the allowed set
let row_passes = |row: usize| -> bool {
parsed_filters.iter().all(|f| {
let v = feature_data[row * num_features + f.feat_idx];
v.is_finite() && v >= f.min && v <= f.max
}) && parsed_enum_filters.iter().all(|ef| {
let v = enum_features[ef.enum_idx].data[row];
v != 255 && ef.allowed.contains(&v)
parsed_filters.iter().all(|filter| {
let value = feature_data[row * num_features + filter.feat_idx];
value.is_finite() && value >= filter.min && value <= filter.max
}) && parsed_enum_filters.iter().all(|enum_filter| {
let value = enum_features[enum_filter.enum_idx].data[row];
value != ENUM_NULL && enum_filter.allowed.contains(&value)
})
};
@ -207,13 +297,22 @@ pub async fn get_hexagons(
return;
}
let cell_id = precomputed[row];
groups
let aggregation = groups
.entry(cell_id)
.or_insert_with(|| CellAgg::new(num_features))
.add_row(feature_data, row, num_features);
.or_insert_with(|| CellAgg::new(num_features, num_enums));
aggregation.add_row(feature_data, row, num_features);
aggregation.add_enums(enum_features, row);
if include_postcode {
aggregation.add_postcode(
&state.data.postcode[row],
state.data.lat[row],
state.data.lon[row],
);
}
});
} else {
let h3_res = h3o::Resolution::try_from(resolution).unwrap();
let h3_res = h3o::Resolution::try_from(resolution)
.map_err(|error| format!("Invalid H3 resolution {}: {}", resolution, error))?;
state
.grid
.for_each_in_bounds(south, west, north, east, |row_idx| {
@ -222,19 +321,37 @@ pub async fn get_hexagons(
return;
}
let cell_id = h3o::LatLng::new(state.data.lat[row], state.data.lon[row])
.map(|c| u64::from(c.to_cell(h3_res)))
.map(|coord| u64::from(coord.to_cell(h3_res)))
.unwrap_or(0);
groups
let aggregation = groups
.entry(cell_id)
.or_insert_with(|| CellAgg::new(num_features))
.add_row(feature_data, row, num_features);
.or_insert_with(|| CellAgg::new(num_features, num_enums));
aggregation.add_row(feature_data, row, num_features);
aggregation.add_enums(enum_features, row);
if include_postcode {
aggregation.add_postcode(
&state.data.postcode[row],
state.data.lat[row],
state.data.lon[row],
);
}
});
}
let t_agg = t0.elapsed();
let mut json_buf = String::with_capacity(groups.len() * 128);
write_hexagons_json(&mut json_buf, &groups, &min_keys, &max_keys, num_features);
write_hexagons_json(
&mut json_buf,
&groups,
min_keys,
max_keys,
num_features,
enum_min_keys,
enum_max_keys,
num_enums,
include_postcode,
);
let t_total = t0.elapsed();
info!(
@ -244,14 +361,15 @@ pub async fn get_hexagons(
filters_raw = filters_str.as_deref().unwrap_or("-"),
agg_ms = format_args!("{:.1}", t_agg.as_secs_f64() * 1000.0),
total_ms = format_args!("{:.1}", t_total.as_secs_f64() * 1000.0),
bytes = json_buf.len(),
size = format_args!("{}", HumanBytes(json_buf.len())),
"GET /api/hexagons"
);
json_buf
Ok(json_buf)
})
.await
.unwrap();
.map_err(|error| (StatusCode::INTERNAL_SERVER_ERROR, error.to_string()))?
.map_err(|error| (StatusCode::INTERNAL_SERVER_ERROR, error))?;
Ok(([("content-type", "application/json")], json_body))
}