Refactor and improve

This commit is contained in:
Andras Schmelczer 2026-02-03 20:26:57 +00:00
parent 1f148b2185
commit 242acff987
22 changed files with 754 additions and 1053 deletions

View file

@ -1,18 +1,50 @@
use std::fmt::Write;
use std::collections::HashMap;
use std::str::FromStr;
use std::sync::Arc;
use axum::extract::Query;
use axum::http::StatusCode;
use axum::response::IntoResponse;
use serde::Deserialize;
use axum::response::Json;
use serde::{Deserialize, Serialize};
use tracing::{info, warn};
use crate::consts::{ENUM_NULL, H3_PRECOMPUTE_MAX, H3_REQUEST_MAX, H3_REQUEST_MIN, HISTOGRAM_BINS};
use crate::filter::{parse_filters, row_passes_filters};
use crate::consts::{H3_PRECOMPUTE_MAX, H3_REQUEST_MAX, H3_REQUEST_MIN, HISTOGRAM_BINS};
use crate::parsing::{h3_cell_bounds, parse_filters, row_passes_filters};
use crate::state::AppState;
use super::parse::h3_cell_bounds;
#[derive(Serialize)]
pub struct HistogramStats {
min: f64,
max: f64,
/// 1st percentile (left edge of main distribution)
p1: f64,
/// 99th percentile (right edge of main distribution)
p99: f64,
counts: Vec<u64>,
}
#[derive(Serialize)]
pub struct NumericFeatureStats {
name: String,
count: usize,
min: f64,
max: f64,
mean: f64,
histogram: HistogramStats,
}
#[derive(Serialize)]
pub struct EnumFeatureStats {
name: String,
counts: HashMap<String, u64>,
}
#[derive(Serialize)]
pub struct HexagonStatsResponse {
count: usize,
numeric_features: Vec<NumericFeatureStats>,
enum_features: Vec<EnumFeatureStats>,
}
#[derive(Deserialize)]
pub struct HexagonStatsParams {
@ -20,15 +52,14 @@ pub struct HexagonStatsParams {
pub resolution: u8,
pub filters: Option<String>,
/// Comma-separated feature names to include in stats response.
/// When present (even if empty), only listed features are computed.
/// When absent, all features are returned (backward compatible).
/// Only listed features are computed; if absent or empty, no features are returned.
pub fields: Option<String>,
}
pub async fn get_hexagon_stats(
state: Arc<AppState>,
Query(params): Query<HexagonStatsParams>,
) -> Result<impl IntoResponse, (StatusCode, String)> {
) -> Result<Json<HexagonStatsResponse>, (StatusCode, String)> {
let cell = h3o::CellIndex::from_str(&params.h3).map_err(|error| {
warn!(h3 = %params.h3, error = %error, "Invalid H3 cell index");
(
@ -57,36 +88,34 @@ pub async fn get_hexagon_stats(
let (parsed_filters, parsed_enum_filters) = parse_filters(
params.filters.as_deref(),
&state.data.feature_names,
&state.data.enum_features,
&state.data.enum_values,
);
let num_filters = parsed_filters.len() + parsed_enum_filters.len();
// Parse optional `fields` param into sets of feature names.
// None = include all, Some = only include listed features.
let field_set: Option<std::collections::HashSet<String>> =
params.fields.as_ref().map(|fields_str| {
let fields_specified = params.fields.is_some();
let field_set: std::collections::HashSet<String> = params
.fields
.as_ref()
.map(|fields_str| {
fields_str
.split(',')
.map(|field| field.trim().to_string())
.filter(|field| !field.is_empty())
.collect()
});
})
.unwrap_or_default();
let result = tokio::task::spawn_blocking(move || {
let response = tokio::task::spawn_blocking(move || {
let start_time = std::time::Instant::now();
let precomputed = &state.h3_cells;
let h3_res = h3o::Resolution::try_from(resolution)
.map_err(|err| format!("Invalid H3 resolution {}: {}", resolution, err))?;
let need_parent = resolution < H3_PRECOMPUTE_MAX;
let num_features = state.data.num_features;
let num_enums = state.data.num_enums;
let feature_data = &state.data.feature_data;
let enum_data = &state.data.enum_data;
let enum_features = &state.data.enum_features;
let (min_lat, min_lon, max_lat, max_lon) = h3_cell_bounds(cell, 0.001);
// Resolve cell at requested resolution from precomputed max-resolution cell
let cell_for_row = |row: usize| -> u64 {
let max_cell = precomputed[row];
if !need_parent || max_cell == 0 {
@ -99,7 +128,6 @@ pub async fn get_hexagon_stats(
.unwrap_or(0)
};
// Collect matching rows
let mut matching_rows: Vec<usize> = Vec::new();
state
.grid
@ -112,8 +140,6 @@ pub async fn get_hexagon_stats(
&parsed_enum_filters,
feature_data,
num_features,
enum_data,
num_enums,
)
{
matching_rows.push(row);
@ -122,140 +148,108 @@ pub async fn get_hexagon_stats(
let total_count = matching_rows.len();
// Build JSON directly via string buffer
let mut output = String::with_capacity(4096);
output.push_str("{\"count\":");
write!(output, "{}", total_count).unwrap();
let mut numeric_features = Vec::new();
let mut enum_features_out = Vec::new();
// Numeric features: compute count, min, max, sum, histogram using global bin edges
output.push_str(",\"numeric_features\":[");
let mut first_numeric = true;
for (feature_index, feature_name) in state.data.feature_names.iter().enumerate() {
// Skip features not in the requested set (when fields param is present)
if let Some(ref set) = field_set {
if !set.contains(feature_name.as_str()) {
continue;
}
}
let global_stats = &state.data.feature_stats[feature_index];
let histogram_min = global_stats.histogram.min;
let histogram_max = global_stats.histogram.max;
let bin_width = global_stats.histogram.bin_width;
let mut count = 0usize;
let mut min_value = f32::INFINITY;
let mut max_value = f32::NEG_INFINITY;
let mut sum = 0.0f64; // keep f64 for mean precision
let mut bins = vec![0u64; HISTOGRAM_BINS];
for &row in &matching_rows {
let value = feature_data[row * num_features + feature_index];
if value.is_finite() {
count += 1;
if value < min_value {
min_value = value;
}
if value > max_value {
max_value = value;
}
sum += value as f64;
// Bin into histogram using global edges (cast to f64 for bin index math)
if bin_width > 0.0 {
let bin_index = ((value as f64 - histogram_min as f64) / bin_width as f64)
.floor() as isize;
let clamped_index =
bin_index.max(0).min((HISTOGRAM_BINS - 1) as isize) as usize;
bins[clamped_index] += 1;
}
}
}
if count == 0 {
if fields_specified && !field_set.contains(feature_name.as_str()) {
continue;
}
if !first_numeric {
output.push(',');
}
first_numeric = false;
let mean = sum / count as f64;
output.push_str("{\"name\":");
write_json_string(&mut output, feature_name);
write!(output, ",\"count\":{}", count).unwrap();
write!(output, ",\"min\":{}", format_num(min_value)).unwrap();
write!(output, ",\"max\":{}", format_num(max_value)).unwrap();
write!(output, ",\"mean\":{}", format_f64(mean)).unwrap();
output.push_str(",\"histogram\":{\"min\":");
write!(output, "{}", format_num(histogram_min)).unwrap();
output.push_str(",\"max\":");
write!(output, "{}", format_num(histogram_max)).unwrap();
output.push_str(",\"bin_width\":");
write!(output, "{}", format_num(bin_width)).unwrap();
output.push_str(",\"counts\":[");
for (bin_index, &bin_count) in bins.iter().enumerate() {
if bin_index > 0 {
output.push(',');
// Check if this is an enum feature
if let Some(enum_values) = state.data.enum_values.get(&feature_index) {
// Enum feature: count occurrences of each value
let mut value_counts = vec![0u64; enum_values.len()];
for &row in &matching_rows {
let value = feature_data[row * num_features + feature_index];
if value.is_finite() {
let idx = value as usize;
if idx < value_counts.len() {
value_counts[idx] += 1;
}
}
}
let counts: HashMap<String, u64> = value_counts
.iter()
.enumerate()
.filter(|(_, &count)| count > 0)
.map(|(idx, &count)| (enum_values[idx].clone(), count))
.collect();
if !counts.is_empty() {
enum_features_out.push(EnumFeatureStats {
name: feature_name.clone(),
counts,
});
}
} else {
// Numeric feature: compute stats and histogram
let global_hist = &state.data.feature_stats[feature_index].histogram;
let p1 = global_hist.p1;
let p99 = global_hist.p99;
let mut count = 0usize;
let mut min_value = f32::INFINITY;
let mut max_value = f32::NEG_INFINITY;
let mut sum = 0.0f64;
let mut bins = vec![0u64; HISTOGRAM_BINS];
// Compute middle bin width (between p1 and p99)
let middle_bins = HISTOGRAM_BINS.saturating_sub(2);
let middle_width = if middle_bins > 0 && p99 > p1 {
(p99 - p1) / middle_bins as f32
} else {
0.0
};
for &row in &matching_rows {
let value = feature_data[row * num_features + feature_index];
if value.is_finite() {
count += 1;
if value < min_value {
min_value = value;
}
if value > max_value {
max_value = value;
}
sum += value as f64;
// Bin using p1/p99 outlier structure
let bin = if value < p1 {
0 // Low outlier bin
} else if value >= p99 {
HISTOGRAM_BINS - 1 // High outlier bin
} else if middle_width > 0.0 {
// Middle bins (1 to n-2)
let middle_bin = ((value - p1) / middle_width) as usize;
(1 + middle_bin).min(HISTOGRAM_BINS - 2)
} else {
HISTOGRAM_BINS / 2 // Fallback if p1 == p99
};
bins[bin] += 1;
}
}
if count > 0 {
numeric_features.push(NumericFeatureStats {
name: feature_name.clone(),
count,
min: min_value as f64,
max: max_value as f64,
mean: sum / count as f64,
histogram: HistogramStats {
min: global_hist.min as f64,
max: global_hist.max as f64,
p1: p1 as f64,
p99: p99 as f64,
counts: bins,
},
});
}
write!(output, "{}", bin_count).unwrap();
}
output.push_str("]}}")
}
// Enum features: count per value
output.push_str("],\"enum_features\":[");
let mut first_enum = true;
for enum_feature in enum_features {
// Skip enum features not in the requested set
if let Some(ref set) = field_set {
if !set.contains(enum_feature.name.as_str()) {
continue;
}
}
let enum_index = match state.enum_name_to_idx.get(&enum_feature.name) {
Some(&index) => index,
None => continue,
};
let mut value_counts = vec![0u64; enum_feature.values.len()];
for &row in &matching_rows {
let value = enum_data[row * num_enums + enum_index];
if value != ENUM_NULL && (value as usize) < value_counts.len() {
value_counts[value as usize] += 1;
}
}
// Only include if there are any non-zero counts
let has_values = value_counts.iter().any(|&count| count > 0);
if !has_values {
continue;
}
if !first_enum {
output.push(',');
}
first_enum = false;
output.push_str("{\"name\":");
write_json_string(&mut output, &enum_feature.name);
output.push_str(",\"counts\":{");
let mut first_value = true;
for (value_index, &count) in value_counts.iter().enumerate() {
if count == 0 {
continue;
}
if !first_value {
output.push(',');
}
first_value = false;
write_json_string(&mut output, &enum_feature.values[value_index]);
write!(output, ":{}", count).unwrap();
}
output.push_str("}}");
}
output.push_str("]}");
let elapsed = start_time.elapsed();
info!(
h3 = %h3_str,
@ -267,46 +261,15 @@ pub async fn get_hexagon_stats(
"GET /api/hexagon-stats"
);
Ok(output)
Ok(HexagonStatsResponse {
count: total_count,
numeric_features,
enum_features: enum_features_out,
})
})
.await
.map_err(|error| (StatusCode::INTERNAL_SERVER_ERROR, error.to_string()))?
.map_err(|error| (StatusCode::INTERNAL_SERVER_ERROR, error))?;
.map_err(|error: String| (StatusCode::INTERNAL_SERVER_ERROR, error))?;
Ok((
[(axum::http::header::CONTENT_TYPE, "application/json")],
result,
))
}
fn write_json_string(output: &mut String, value: &str) {
output.push('"');
for character in value.chars() {
match character {
'"' => output.push_str("\\\""),
'\\' => output.push_str("\\\\"),
'\n' => output.push_str("\\n"),
'\r' => output.push_str("\\r"),
'\t' => output.push_str("\\t"),
other => output.push(other),
}
}
output.push('"');
}
fn format_num(value: f32) -> String {
let fv = value as f64;
if fv.fract() == 0.0 && fv.abs() < 1e15 {
format!("{:.1}", fv)
} else {
format!("{}", fv)
}
}
fn format_f64(value: f64) -> String {
if value.fract() == 0.0 && value.abs() < 1e15 {
format!("{:.1}", value)
} else {
format!("{}", value)
}
Ok(Json(response))
}