Refactor and other improvements
This commit is contained in:
parent
04a78e7bfe
commit
6c90cf3c0f
47 changed files with 2705 additions and 1568 deletions
|
|
@ -10,7 +10,7 @@ use rustc_hash::{FxHashMap, FxHashSet};
|
|||
use serde::Deserialize;
|
||||
use tracing::{info, warn};
|
||||
|
||||
use crate::parsing::{parse_bounds, parse_filters, row_passes_filters};
|
||||
use crate::parsing::{parse_field_indices, parse_filters, require_bounds, row_passes_filters};
|
||||
use crate::routes::FeatureInfo;
|
||||
use crate::state::AppState;
|
||||
|
||||
|
|
@ -135,12 +135,7 @@ pub async fn get_export(
|
|||
state: Arc<AppState>,
|
||||
Query(params): Query<ExportParams>,
|
||||
) -> Result<impl IntoResponse, (StatusCode, String)> {
|
||||
let bounds_str = params.bounds.ok_or((
|
||||
StatusCode::BAD_REQUEST,
|
||||
"bounds parameter is required".into(),
|
||||
))?;
|
||||
|
||||
let (south, west, north, east) = parse_bounds(&bounds_str)?;
|
||||
let (south, west, north, east) = require_bounds(params.bounds)?;
|
||||
|
||||
let filters_str = params.filters.clone();
|
||||
let fields_str = params.fields.clone();
|
||||
|
|
@ -234,7 +229,10 @@ pub async fn get_export(
|
|||
let was_sampled = postcode_aggs.len() > MAX_EXPORT_POSTCODES;
|
||||
if was_sampled {
|
||||
let mut hasher = DefaultHasher::new();
|
||||
bounds_str.hash(&mut hasher);
|
||||
south.to_bits().hash(&mut hasher);
|
||||
west.to_bits().hash(&mut hasher);
|
||||
north.to_bits().hash(&mut hasher);
|
||||
east.to_bits().hash(&mut hasher);
|
||||
let seed = hasher.finish();
|
||||
|
||||
let len = postcode_aggs.len();
|
||||
|
|
@ -251,20 +249,8 @@ pub async fn get_export(
|
|||
// Determine column order: filter features first, then remaining
|
||||
let filter_feature_names = extract_filter_feature_names(filters_str.as_deref());
|
||||
|
||||
let field_indices: Option<Vec<usize>> = fields_str.as_ref().map(|fs| {
|
||||
if fs.is_empty() {
|
||||
return Vec::new();
|
||||
}
|
||||
fs.split(',')
|
||||
.filter_map(|name| {
|
||||
let name = name.trim();
|
||||
if name.is_empty() {
|
||||
return None;
|
||||
}
|
||||
state.feature_name_to_index.get(name).copied()
|
||||
})
|
||||
.collect()
|
||||
});
|
||||
let field_indices =
|
||||
parse_field_indices(fields_str.as_deref(), &state.feature_name_to_index);
|
||||
|
||||
let all_feature_indices: Vec<usize> = if let Some(ref indices) = field_indices {
|
||||
indices.clone()
|
||||
|
|
@ -314,7 +300,7 @@ pub async fn get_export(
|
|||
.set_font_color("#666666")
|
||||
.set_align(FormatAlign::Left);
|
||||
|
||||
// Row 0: "View on Narrowit" link
|
||||
// Row 0: "View on Perfect Postcodes" link
|
||||
let mut dashboard_url = format!("{}/", public_url);
|
||||
let mut query_parts: Vec<String> = Vec::new();
|
||||
query_parts.push(format!("v={}", view_param));
|
||||
|
|
@ -329,7 +315,7 @@ pub async fn get_export(
|
|||
}
|
||||
|
||||
sheet
|
||||
.write_url(0, 0, Url::new(&dashboard_url).set_text("View on Narrowit"))
|
||||
.write_url(0, 0, Url::new(&dashboard_url).set_text("View on Perfect Postcodes"))
|
||||
.map_err(|err| format!("Failed to write URL: {err}"))?;
|
||||
sheet
|
||||
.set_row_format(0, &link_fmt)
|
||||
|
|
@ -499,7 +485,7 @@ pub async fn get_export(
|
|||
),
|
||||
(
|
||||
header::CONTENT_DISPOSITION,
|
||||
"attachment; filename=\"narrowit-export.xlsx\"",
|
||||
"attachment; filename=\"perfect-postcodes-export.xlsx\"",
|
||||
),
|
||||
],
|
||||
bytes,
|
||||
|
|
|
|||
|
|
@ -8,10 +8,14 @@ use axum::response::Json;
|
|||
use serde::{Deserialize, Serialize};
|
||||
use tracing::{info, warn};
|
||||
|
||||
use crate::consts::{H3_PRECOMPUTE_MAX, H3_REQUEST_MAX, H3_REQUEST_MIN};
|
||||
use crate::parsing::{h3_cell_bounds, parse_filters, row_passes_filters};
|
||||
use crate::parsing::{
|
||||
cell_for_row, h3_cell_bounds, needs_parent, parse_field_set, parse_filters, row_passes_filters,
|
||||
validate_h3_resolution,
|
||||
};
|
||||
use crate::state::AppState;
|
||||
|
||||
use super::stats;
|
||||
|
||||
#[derive(Serialize)]
|
||||
pub struct HistogramStats {
|
||||
pub min: f64,
|
||||
|
|
@ -78,19 +82,8 @@ pub async fn get_hexagon_stats(
|
|||
let cell_u64: u64 = cell.into();
|
||||
|
||||
let resolution = params.resolution;
|
||||
if !(H3_REQUEST_MIN..=H3_REQUEST_MAX).contains(&resolution) {
|
||||
warn!(
|
||||
resolution,
|
||||
"Resolution out of range [{}, {}]", H3_REQUEST_MIN, H3_REQUEST_MAX
|
||||
);
|
||||
return Err((
|
||||
StatusCode::BAD_REQUEST,
|
||||
format!(
|
||||
"resolution must be between {} and {}",
|
||||
H3_REQUEST_MIN, H3_REQUEST_MAX
|
||||
),
|
||||
));
|
||||
}
|
||||
validate_h3_resolution(resolution)?;
|
||||
|
||||
let h3_str = params.h3.clone();
|
||||
let filters_str = params.filters.clone();
|
||||
let (parsed_filters, parsed_enum_filters) = parse_filters(
|
||||
|
|
@ -100,48 +93,25 @@ pub async fn get_hexagon_stats(
|
|||
);
|
||||
let num_filters = parsed_filters.len() + parsed_enum_filters.len();
|
||||
|
||||
let fields_specified = params.fields.is_some();
|
||||
let field_set: std::collections::HashSet<String> = params
|
||||
.fields
|
||||
.as_ref()
|
||||
.map(|fields_str| {
|
||||
fields_str
|
||||
.split(',')
|
||||
.map(|field| field.trim().to_string())
|
||||
.filter(|field| !field.is_empty())
|
||||
.collect()
|
||||
})
|
||||
.unwrap_or_default();
|
||||
let (fields_specified, field_set) = parse_field_set(params.fields.as_deref());
|
||||
|
||||
let response = tokio::task::spawn_blocking(move || {
|
||||
let start_time = std::time::Instant::now();
|
||||
let precomputed = &state.h3_cells;
|
||||
let h3_res = h3o::Resolution::try_from(resolution)
|
||||
.map_err(|err| format!("Invalid H3 resolution {}: {}", resolution, err))?;
|
||||
let need_parent = resolution < H3_PRECOMPUTE_MAX;
|
||||
let need_parent = needs_parent(resolution);
|
||||
let num_features = state.data.num_features;
|
||||
let feature_data = &state.data.feature_data;
|
||||
|
||||
let (min_lat, min_lon, max_lat, max_lon) = h3_cell_bounds(cell, 0.001);
|
||||
|
||||
let cell_for_row = |row: usize| -> u64 {
|
||||
let max_cell = precomputed[row];
|
||||
if !need_parent || max_cell == 0 {
|
||||
return max_cell;
|
||||
}
|
||||
h3o::CellIndex::try_from(max_cell)
|
||||
.ok()
|
||||
.and_then(|ci| ci.parent(h3_res))
|
||||
.map(u64::from)
|
||||
.unwrap_or(0)
|
||||
};
|
||||
|
||||
let mut matching_rows: Vec<usize> = Vec::new();
|
||||
state
|
||||
.grid
|
||||
.for_each_in_bounds(min_lat, min_lon, max_lat, max_lon, |row_idx| {
|
||||
let row = row_idx as usize;
|
||||
if cell_for_row(row) == cell_u64
|
||||
if cell_for_row(row, precomputed, h3_res, need_parent) == cell_u64
|
||||
&& row_passes_filters(
|
||||
row,
|
||||
&parsed_filters,
|
||||
|
|
@ -156,149 +126,23 @@ pub async fn get_hexagon_stats(
|
|||
|
||||
let total_count = matching_rows.len();
|
||||
|
||||
// Collect price history (year, price) pairs
|
||||
let price_history = {
|
||||
let year_idx = state
|
||||
.feature_name_to_index
|
||||
.get("Date of last transaction")
|
||||
.copied();
|
||||
let price_idx = state.feature_name_to_index.get("Last known price").copied();
|
||||
match (year_idx, price_idx) {
|
||||
(Some(yi), Some(pi)) => {
|
||||
let mut points: Vec<PricePoint> = matching_rows
|
||||
.iter()
|
||||
.filter_map(|&row| {
|
||||
let year = feature_data[row * num_features + yi];
|
||||
let price = feature_data[row * num_features + pi];
|
||||
if year.is_finite() && price.is_finite() {
|
||||
Some(PricePoint { year, price })
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
// Cap at 5000 points by evenly sampling
|
||||
if points.len() > 5000 {
|
||||
let step = points.len() as f64 / 5000.0;
|
||||
points = (0..5000)
|
||||
.map(|i| {
|
||||
let idx = (i as f64 * step) as usize;
|
||||
PricePoint {
|
||||
year: points[idx].year,
|
||||
price: points[idx].price,
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
}
|
||||
points
|
||||
}
|
||||
_ => Vec::new(),
|
||||
}
|
||||
};
|
||||
let price_history = stats::extract_price_history(
|
||||
&matching_rows,
|
||||
feature_data,
|
||||
num_features,
|
||||
&state.feature_name_to_index,
|
||||
);
|
||||
|
||||
let mut numeric_features = Vec::new();
|
||||
let mut enum_features_out = Vec::new();
|
||||
|
||||
for (feature_index, feature_name) in state.data.feature_names.iter().enumerate() {
|
||||
if fields_specified && !field_set.contains(feature_name.as_str()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Check if this is an enum feature
|
||||
if let Some(enum_values) = state.data.enum_values.get(&feature_index) {
|
||||
// Enum feature: count occurrences of each value
|
||||
let mut value_counts = vec![0u64; enum_values.len()];
|
||||
for &row in &matching_rows {
|
||||
let value = feature_data[row * num_features + feature_index];
|
||||
if value.is_finite() {
|
||||
let idx = value as usize;
|
||||
if idx < value_counts.len() {
|
||||
value_counts[idx] += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let counts: HashMap<String, u64> = value_counts
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter(|(_, &count)| count > 0)
|
||||
.map(|(idx, &count)| (enum_values[idx].clone(), count))
|
||||
.collect();
|
||||
|
||||
if !counts.is_empty() {
|
||||
enum_features_out.push(EnumFeatureStats {
|
||||
name: feature_name.clone(),
|
||||
counts,
|
||||
});
|
||||
}
|
||||
} else {
|
||||
// Numeric feature: compute stats and histogram
|
||||
let global_hist = &state.data.feature_stats[feature_index].histogram;
|
||||
let p1 = global_hist.p1;
|
||||
let p99 = global_hist.p99;
|
||||
// Use same bin count as global histogram for consistency
|
||||
let num_bins = global_hist.counts.len();
|
||||
|
||||
let mut count = 0usize;
|
||||
let mut min_value = f32::INFINITY;
|
||||
let mut max_value = f32::NEG_INFINITY;
|
||||
let mut sum = 0.0f64;
|
||||
let mut bins = vec![0u64; num_bins];
|
||||
|
||||
// Compute middle bin width (between p1 and p99)
|
||||
let middle_bins = num_bins.saturating_sub(2);
|
||||
let middle_width = if middle_bins > 0 && p99 > p1 {
|
||||
(p99 - p1) / middle_bins as f32
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
|
||||
for &row in &matching_rows {
|
||||
let value = feature_data[row * num_features + feature_index];
|
||||
if value.is_finite() {
|
||||
count += 1;
|
||||
if value < min_value {
|
||||
min_value = value;
|
||||
}
|
||||
if value > max_value {
|
||||
max_value = value;
|
||||
}
|
||||
sum += value as f64;
|
||||
|
||||
// Bin using p1/p99 outlier structure
|
||||
let bin = if value < p1 {
|
||||
0 // Low outlier bin
|
||||
} else if value >= p99 {
|
||||
num_bins - 1 // High outlier bin
|
||||
} else if middle_width > 0.0 {
|
||||
// Middle bins (1 to n-2)
|
||||
let middle_bin = ((value - p1) / middle_width) as usize;
|
||||
(1 + middle_bin).min(num_bins - 2)
|
||||
} else {
|
||||
num_bins / 2 // Fallback if p1 == p99
|
||||
};
|
||||
bins[bin] += 1;
|
||||
}
|
||||
}
|
||||
|
||||
if count > 0 {
|
||||
numeric_features.push(NumericFeatureStats {
|
||||
name: feature_name.clone(),
|
||||
count,
|
||||
min: min_value as f64,
|
||||
max: max_value as f64,
|
||||
mean: sum / count as f64,
|
||||
histogram: HistogramStats {
|
||||
min: global_hist.min as f64,
|
||||
max: global_hist.max as f64,
|
||||
p1: p1 as f64,
|
||||
p99: p99 as f64,
|
||||
counts: bins,
|
||||
},
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
let (numeric_features, enum_features_out) = stats::compute_feature_stats(
|
||||
&matching_rows,
|
||||
feature_data,
|
||||
&state.data.feature_names,
|
||||
num_features,
|
||||
&state.data.enum_values,
|
||||
&state.data.feature_stats,
|
||||
fields_specified,
|
||||
&field_set,
|
||||
);
|
||||
|
||||
let elapsed = start_time.elapsed();
|
||||
info!(
|
||||
|
|
|
|||
|
|
@ -6,11 +6,13 @@ use axum::response::Json;
|
|||
use rustc_hash::FxHashMap;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::{Map, Value};
|
||||
use tracing::{info, warn};
|
||||
use tracing::info;
|
||||
|
||||
use crate::consts::{H3_PRECOMPUTE_MAX, H3_REQUEST_MAX, H3_REQUEST_MIN, MAX_CELLS_PER_REQUEST};
|
||||
use crate::aggregation::Aggregator;
|
||||
use crate::consts::MAX_CELLS_PER_REQUEST;
|
||||
use crate::parsing::{
|
||||
bounds_intersect, h3_cell_bounds, parse_bounds, parse_filters, row_passes_filters,
|
||||
bounds_intersect, cell_for_row, h3_cell_bounds, needs_parent, parse_field_indices,
|
||||
parse_filters, require_bounds, row_passes_filters, validate_h3_resolution,
|
||||
};
|
||||
use crate::state::AppState;
|
||||
|
||||
|
|
@ -32,79 +34,9 @@ pub struct HexagonParams {
|
|||
fields: Option<String>,
|
||||
}
|
||||
|
||||
/// Per-cell accumulator for aggregating features.
///
/// For every feature column it tracks the minimum, maximum, sum and number of
/// finite values seen, alongside the total number of rows added to the cell.
/// Uses `Box<[T]>` instead of `Vec<T>` to avoid storing capacity (saves 8 bytes per field per cell).
struct CellAgg {
    /// Number of rows added, whether or not their values were finite.
    count: u32,
    /// Per-feature minimum of the finite values seen; `f32::INFINITY` until one is seen.
    mins: Box<[f32]>,
    /// Per-feature maximum of the finite values seen; `f32::NEG_INFINITY` until one is seen.
    maxs: Box<[f32]>,
    /// Per-feature sum of the finite values seen, accumulated in `f64`.
    sums: Box<[f64]>,
    /// Per-feature number of finite values seen.
    feat_counts: Box<[u32]>,
}

impl CellAgg {
    /// Create an empty accumulator with one slot per feature.
    fn new(num_features: usize) -> Self {
        CellAgg {
            count: 0,
            mins: vec![f32::INFINITY; num_features].into_boxed_slice(),
            maxs: vec![f32::NEG_INFINITY; num_features].into_boxed_slice(),
            sums: vec![0.0f64; num_features].into_boxed_slice(),
            feat_counts: vec![0u32; num_features].into_boxed_slice(),
        }
    }

    /// Fold a single value into the statistics of feature `feat_index`.
    ///
    /// Non-finite values (NaN, ±infinity) are ignored so they never affect
    /// min/max/sum or the per-feature count.
    ///
    /// # Panics
    /// Panics if `feat_index` is out of range for this accumulator.
    #[inline]
    fn accumulate(&mut self, feat_index: usize, value: f32) {
        if !value.is_finite() {
            return;
        }
        if value < self.mins[feat_index] {
            self.mins[feat_index] = value;
        }
        if value > self.maxs[feat_index] {
            self.maxs[feat_index] = value;
        }
        self.sums[feat_index] += f64::from(value);
        self.feat_counts[feat_index] += 1;
    }

    /// Add a row using row-major `feature_data` layout.
    ///
    /// `feature_data[row * num_features + feat_idx]` — all features for one row
    /// are contiguous, so the whole row is read as one sequential slice.
    ///
    /// # Panics
    /// Panics if the row lies outside `feature_data`, or if `num_features`
    /// exceeds the number of slots this accumulator was created with.
    #[inline]
    fn add_row(&mut self, feature_data: &[f32], row: usize, num_features: usize) {
        self.count += 1;
        let base = row * num_features;
        let row_slice = &feature_data[base..base + num_features];
        for (feat_index, &value) in row_slice.iter().enumerate() {
            self.accumulate(feat_index, value);
        }
    }

    /// Add a row, only aggregating the features at the given indices.
    ///
    /// The row still counts towards `count` even when `indices` is empty.
    ///
    /// # Panics
    /// Panics if any index addresses a value outside `feature_data` or a slot
    /// outside this accumulator.
    #[inline]
    fn add_row_selective(
        &mut self,
        feature_data: &[f32],
        row: usize,
        num_features: usize,
        indices: &[usize],
    ) {
        self.count += 1;
        let base = row * num_features;
        for &feat_index in indices {
            self.accumulate(feat_index, feature_data[base + feat_index]);
        }
    }
}
|
||||
|
||||
/// Build feature maps from aggregated cell data, filtering to only cells that intersect the query bounds.
|
||||
fn build_feature_maps(
|
||||
groups: &FxHashMap<u64, CellAgg>,
|
||||
groups: &FxHashMap<u64, Aggregator>,
|
||||
min_keys: &[String],
|
||||
max_keys: &[String],
|
||||
avg_keys: &[String],
|
||||
|
|
@ -172,26 +104,9 @@ pub async fn get_hexagons(
|
|||
Query(params): Query<HexagonParams>,
|
||||
) -> Result<Json<HexagonsResponse>, (StatusCode, String)> {
|
||||
let resolution = params.resolution;
|
||||
if !(H3_REQUEST_MIN..=H3_REQUEST_MAX).contains(&resolution) {
|
||||
warn!(
|
||||
resolution,
|
||||
"Resolution out of range [{}, {}]", H3_REQUEST_MIN, H3_REQUEST_MAX
|
||||
);
|
||||
return Err((
|
||||
StatusCode::BAD_REQUEST,
|
||||
format!(
|
||||
"resolution must be between {} and {}",
|
||||
H3_REQUEST_MIN, H3_REQUEST_MAX
|
||||
),
|
||||
));
|
||||
}
|
||||
validate_h3_resolution(resolution)?;
|
||||
|
||||
let bounds_str = params.bounds.ok_or((
|
||||
StatusCode::BAD_REQUEST,
|
||||
"bounds parameter is required".into(),
|
||||
))?;
|
||||
|
||||
let (south, west, north, east) = parse_bounds(&bounds_str)?;
|
||||
let (south, west, north, east) = require_bounds(params.bounds)?;
|
||||
|
||||
let filters_str = params.filters.clone();
|
||||
let (parsed_filters, parsed_enum_filters) = parse_filters(
|
||||
|
|
@ -201,24 +116,7 @@ pub async fn get_hexagons(
|
|||
);
|
||||
let num_filters = parsed_filters.len() + parsed_enum_filters.len();
|
||||
|
||||
// Parse optional `fields` param into feature indices.
|
||||
// If `fields` is absent (None), all features are included.
|
||||
// If `fields` is present (even empty string), only listed features are included.
|
||||
let field_indices: Option<Vec<usize>> = params.fields.as_ref().map(|fields_str| {
|
||||
if fields_str.is_empty() {
|
||||
return Vec::new();
|
||||
}
|
||||
fields_str
|
||||
.split(',')
|
||||
.filter_map(|name| {
|
||||
let name = name.trim();
|
||||
if name.is_empty() {
|
||||
return None;
|
||||
}
|
||||
state.feature_name_to_index.get(name).copied()
|
||||
})
|
||||
.collect()
|
||||
});
|
||||
let field_indices = parse_field_indices(params.fields.as_deref(), &state.feature_name_to_index);
|
||||
|
||||
let response = tokio::task::spawn_blocking(move || -> Result<HexagonsResponse, String> {
|
||||
let t0 = std::time::Instant::now();
|
||||
|
|
@ -232,21 +130,9 @@ pub async fn get_hexagons(
|
|||
let h3_res = h3o::Resolution::try_from(resolution)
|
||||
.map_err(|error| format!("Invalid H3 resolution {}: {}", resolution, error))?;
|
||||
let precomputed = &state.h3_cells;
|
||||
let need_parent = resolution < H3_PRECOMPUTE_MAX;
|
||||
let need_parent = needs_parent(resolution);
|
||||
|
||||
let mut groups: FxHashMap<u64, CellAgg> = FxHashMap::default();
|
||||
|
||||
let cell_for_row = |row: usize| -> u64 {
|
||||
let max_cell = precomputed[row];
|
||||
if !need_parent || max_cell == 0 {
|
||||
return max_cell;
|
||||
}
|
||||
h3o::CellIndex::try_from(max_cell)
|
||||
.ok()
|
||||
.and_then(|ci| ci.parent(h3_res))
|
||||
.map(u64::from)
|
||||
.unwrap_or(0)
|
||||
};
|
||||
let mut groups: FxHashMap<u64, Aggregator> = FxHashMap::default();
|
||||
|
||||
// Hoist has_selective branch outside the hot loop to avoid per-row branching
|
||||
if let Some(sel_indices) = field_indices.as_deref() {
|
||||
|
|
@ -263,10 +149,10 @@ pub async fn get_hexagons(
|
|||
) {
|
||||
return;
|
||||
}
|
||||
let cell_id = cell_for_row(row);
|
||||
let cell_id = cell_for_row(row, precomputed, h3_res, need_parent);
|
||||
let aggregation = groups
|
||||
.entry(cell_id)
|
||||
.or_insert_with(|| CellAgg::new(num_features));
|
||||
.or_insert_with(|| Aggregator::new(num_features));
|
||||
aggregation.add_row_selective(feature_data, row, num_features, sel_indices);
|
||||
});
|
||||
} else {
|
||||
|
|
@ -283,10 +169,10 @@ pub async fn get_hexagons(
|
|||
) {
|
||||
return;
|
||||
}
|
||||
let cell_id = cell_for_row(row);
|
||||
let cell_id = cell_for_row(row, precomputed, h3_res, need_parent);
|
||||
let aggregation = groups
|
||||
.entry(cell_id)
|
||||
.or_insert_with(|| CellAgg::new(num_features));
|
||||
.or_insert_with(|| Aggregator::new(num_features));
|
||||
aggregation.add_row(feature_data, row, num_features);
|
||||
});
|
||||
}
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@ use tracing::info;
|
|||
|
||||
use crate::consts::MAX_POIS_PER_REQUEST;
|
||||
use crate::data::POICategoryGroup;
|
||||
use crate::parsing::parse_bounds;
|
||||
use crate::parsing::require_bounds;
|
||||
use crate::state::AppState;
|
||||
|
||||
#[derive(Serialize)]
|
||||
|
|
@ -39,12 +39,7 @@ pub async fn get_pois(
|
|||
state: Arc<AppState>,
|
||||
Query(params): Query<POIParams>,
|
||||
) -> Result<Json<POIsResponse>, (StatusCode, String)> {
|
||||
let bounds_str = params.bounds.ok_or((
|
||||
StatusCode::BAD_REQUEST,
|
||||
"bounds parameter is required".into(),
|
||||
))?;
|
||||
|
||||
let (south, west, north, east) = parse_bounds(&bounds_str)?;
|
||||
let (south, west, north, east) = require_bounds(params.bounds)?;
|
||||
|
||||
let categories_str = params.categories.clone();
|
||||
let category_filter: Option<rustc_hash::FxHashSet<String>> = params
|
||||
|
|
|
|||
|
|
@ -1,4 +1,3 @@
|
|||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
|
||||
use axum::extract::Query;
|
||||
|
|
@ -7,12 +6,12 @@ use axum::response::Json;
|
|||
use serde::Deserialize;
|
||||
use tracing::{info, warn};
|
||||
|
||||
use crate::parsing::{parse_filters, row_passes_filters};
|
||||
use crate::consts::POSTCODE_SEARCH_OFFSET;
|
||||
use crate::parsing::{parse_field_set, parse_filters, row_passes_filters};
|
||||
use crate::state::AppState;
|
||||
|
||||
use super::hexagon_stats::{
|
||||
EnumFeatureStats, HexagonStatsResponse, HistogramStats, NumericFeatureStats, PricePoint,
|
||||
};
|
||||
use super::hexagon_stats::HexagonStatsResponse;
|
||||
use super::stats;
|
||||
|
||||
#[derive(Deserialize)]
|
||||
pub struct PostcodeStatsParams {
|
||||
|
|
@ -56,18 +55,7 @@ pub async fn get_postcode_stats(
|
|||
);
|
||||
let num_filters = parsed_filters.len() + parsed_enum_filters.len();
|
||||
|
||||
let fields_specified = params.fields.is_some();
|
||||
let field_set: std::collections::HashSet<String> = params
|
||||
.fields
|
||||
.as_ref()
|
||||
.map(|fields_str| {
|
||||
fields_str
|
||||
.split(',')
|
||||
.map(|field| field.trim().to_string())
|
||||
.filter(|field| !field.is_empty())
|
||||
.collect()
|
||||
})
|
||||
.unwrap_or_default();
|
||||
let (fields_specified, field_set) = parse_field_set(params.fields.as_deref());
|
||||
|
||||
let postcode_str = normalized.clone();
|
||||
|
||||
|
|
@ -76,8 +64,8 @@ pub async fn get_postcode_stats(
|
|||
let num_features = state.data.num_features;
|
||||
let feature_data = &state.data.feature_data;
|
||||
|
||||
// Search ±0.02° around centroid (~2km, generous for a postcode)
|
||||
let offset: f64 = 0.02;
|
||||
// Search around centroid (generous for a postcode)
|
||||
let offset: f64 = POSTCODE_SEARCH_OFFSET;
|
||||
let min_lat = centroid_lat as f64 - offset;
|
||||
let max_lat = centroid_lat as f64 + offset;
|
||||
let min_lon = centroid_lon as f64 - offset;
|
||||
|
|
@ -104,144 +92,23 @@ pub async fn get_postcode_stats(
|
|||
|
||||
let total_count = matching_rows.len();
|
||||
|
||||
// Collect price history (year, price) pairs
|
||||
let price_history = {
|
||||
let year_idx = state
|
||||
.feature_name_to_index
|
||||
.get("Date of last transaction")
|
||||
.copied();
|
||||
let price_idx = state.feature_name_to_index.get("Last known price").copied();
|
||||
match (year_idx, price_idx) {
|
||||
(Some(yi), Some(pi)) => {
|
||||
let mut points: Vec<PricePoint> = matching_rows
|
||||
.iter()
|
||||
.filter_map(|&row| {
|
||||
let year = feature_data[row * num_features + yi];
|
||||
let price = feature_data[row * num_features + pi];
|
||||
if year.is_finite() && price.is_finite() {
|
||||
Some(PricePoint { year, price })
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
// Cap at 5000 points by evenly sampling
|
||||
if points.len() > 5000 {
|
||||
let step = points.len() as f64 / 5000.0;
|
||||
points = (0..5000)
|
||||
.map(|i| {
|
||||
let idx = (i as f64 * step) as usize;
|
||||
PricePoint {
|
||||
year: points[idx].year,
|
||||
price: points[idx].price,
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
}
|
||||
points
|
||||
}
|
||||
_ => Vec::new(),
|
||||
}
|
||||
};
|
||||
let price_history = stats::extract_price_history(
|
||||
&matching_rows,
|
||||
feature_data,
|
||||
num_features,
|
||||
&state.feature_name_to_index,
|
||||
);
|
||||
|
||||
let mut numeric_features = Vec::new();
|
||||
let mut enum_features_out = Vec::new();
|
||||
|
||||
for (feature_index, feature_name) in state.data.feature_names.iter().enumerate() {
|
||||
if fields_specified && !field_set.contains(feature_name.as_str()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if let Some(enum_values) = state.data.enum_values.get(&feature_index) {
|
||||
// Enum feature: count occurrences of each value
|
||||
let mut value_counts = vec![0u64; enum_values.len()];
|
||||
for &row in &matching_rows {
|
||||
let value = feature_data[row * num_features + feature_index];
|
||||
if value.is_finite() {
|
||||
let idx = value as usize;
|
||||
if idx < value_counts.len() {
|
||||
value_counts[idx] += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let counts: HashMap<String, u64> = value_counts
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter(|(_, &count)| count > 0)
|
||||
.map(|(idx, &count)| (enum_values[idx].clone(), count))
|
||||
.collect();
|
||||
|
||||
if !counts.is_empty() {
|
||||
enum_features_out.push(EnumFeatureStats {
|
||||
name: feature_name.clone(),
|
||||
counts,
|
||||
});
|
||||
}
|
||||
} else {
|
||||
// Numeric feature: compute stats and histogram
|
||||
let global_hist = &state.data.feature_stats[feature_index].histogram;
|
||||
let p1 = global_hist.p1;
|
||||
let p99 = global_hist.p99;
|
||||
let num_bins = global_hist.counts.len();
|
||||
|
||||
let mut count = 0usize;
|
||||
let mut min_value = f32::INFINITY;
|
||||
let mut max_value = f32::NEG_INFINITY;
|
||||
let mut sum = 0.0f64;
|
||||
let mut bins = vec![0u64; num_bins];
|
||||
|
||||
let middle_bins = num_bins.saturating_sub(2);
|
||||
let middle_width = if middle_bins > 0 && p99 > p1 {
|
||||
(p99 - p1) / middle_bins as f32
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
|
||||
for &row in &matching_rows {
|
||||
let value = feature_data[row * num_features + feature_index];
|
||||
if value.is_finite() {
|
||||
count += 1;
|
||||
if value < min_value {
|
||||
min_value = value;
|
||||
}
|
||||
if value > max_value {
|
||||
max_value = value;
|
||||
}
|
||||
sum += value as f64;
|
||||
|
||||
let bin = if value < p1 {
|
||||
0
|
||||
} else if value >= p99 {
|
||||
num_bins - 1
|
||||
} else if middle_width > 0.0 {
|
||||
let middle_bin = ((value - p1) / middle_width) as usize;
|
||||
(1 + middle_bin).min(num_bins - 2)
|
||||
} else {
|
||||
num_bins / 2
|
||||
};
|
||||
bins[bin] += 1;
|
||||
}
|
||||
}
|
||||
|
||||
if count > 0 {
|
||||
numeric_features.push(NumericFeatureStats {
|
||||
name: feature_name.clone(),
|
||||
count,
|
||||
min: min_value as f64,
|
||||
max: max_value as f64,
|
||||
mean: sum / count as f64,
|
||||
histogram: HistogramStats {
|
||||
min: global_hist.min as f64,
|
||||
max: global_hist.max as f64,
|
||||
p1: p1 as f64,
|
||||
p99: p99 as f64,
|
||||
counts: bins,
|
||||
},
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
let (numeric_features, enum_features_out) = stats::compute_feature_stats(
|
||||
&matching_rows,
|
||||
feature_data,
|
||||
&state.data.feature_names,
|
||||
num_features,
|
||||
&state.data.enum_values,
|
||||
&state.data.feature_stats,
|
||||
fields_specified,
|
||||
&field_set,
|
||||
);
|
||||
|
||||
let elapsed = start_time.elapsed();
|
||||
info!(
|
||||
|
|
|
|||
|
|
@ -8,8 +8,11 @@ use serde::{Deserialize, Serialize};
|
|||
use serde_json::{Map, Value};
|
||||
use tracing::info;
|
||||
|
||||
use crate::aggregation::Aggregator;
|
||||
use crate::consts::MAX_CELLS_PER_REQUEST;
|
||||
use crate::parsing::{bounds_intersect, parse_bounds, parse_filters, row_passes_filters};
|
||||
use crate::parsing::{
|
||||
bounds_intersect, parse_field_indices, parse_filters, require_bounds, row_passes_filters,
|
||||
};
|
||||
use crate::state::AppState;
|
||||
|
||||
#[derive(Serialize)]
|
||||
|
|
@ -27,68 +30,31 @@ pub struct PostcodeParams {
|
|||
fields: Option<String>,
|
||||
}
|
||||
|
||||
/// Per-postcode accumulator for aggregating features.
///
/// Holds one slot per feature in each of the boxed slices, plus the number of
/// rows that were folded into this postcode.
struct PostcodeAgg {
    /// Number of rows added to this postcode.
    count: u32,
    /// Per-feature minimum of the finite values seen (starts at `f32::INFINITY`).
    mins: Box<[f32]>,
    /// Per-feature maximum of the finite values seen (starts at `f32::NEG_INFINITY`).
    maxs: Box<[f32]>,
    /// Per-feature sum of the finite values seen, accumulated in `f64`.
    sums: Box<[f64]>,
    /// Per-feature number of finite values seen.
    feat_counts: Box<[u32]>,
}
|
||||
|
||||
impl PostcodeAgg {
|
||||
fn new(num_features: usize) -> Self {
|
||||
PostcodeAgg {
|
||||
count: 0,
|
||||
mins: vec![f32::INFINITY; num_features].into_boxed_slice(),
|
||||
maxs: vec![f32::NEG_INFINITY; num_features].into_boxed_slice(),
|
||||
sums: vec![0.0f64; num_features].into_boxed_slice(),
|
||||
feat_counts: vec![0u32; num_features].into_boxed_slice(),
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn add_row(&mut self, feature_data: &[f32], row: usize, num_features: usize) {
|
||||
self.count += 1;
|
||||
let base = row * num_features;
|
||||
let row_slice = &feature_data[base..base + num_features];
|
||||
for (feat_index, &value) in row_slice.iter().enumerate() {
|
||||
if value.is_finite() {
|
||||
if value < self.mins[feat_index] {
|
||||
self.mins[feat_index] = value;
|
||||
}
|
||||
if value > self.maxs[feat_index] {
|
||||
self.maxs[feat_index] = value;
|
||||
}
|
||||
self.sums[feat_index] += value as f64;
|
||||
self.feat_counts[feat_index] += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn add_row_selective(
|
||||
&mut self,
|
||||
feature_data: &[f32],
|
||||
row: usize,
|
||||
num_features: usize,
|
||||
indices: &[usize],
|
||||
) {
|
||||
self.count += 1;
|
||||
let base = row * num_features;
|
||||
for &feat_index in indices {
|
||||
let value = feature_data[base + feat_index];
|
||||
if value.is_finite() {
|
||||
if value < self.mins[feat_index] {
|
||||
self.mins[feat_index] = value;
|
||||
}
|
||||
if value > self.maxs[feat_index] {
|
||||
self.maxs[feat_index] = value;
|
||||
}
|
||||
self.sums[feat_index] += value as f64;
|
||||
self.feat_counts[feat_index] += 1;
|
||||
}
|
||||
}
|
||||
/// Build a GeoJSON geometry object from postcode polygon rings.
|
||||
/// Returns Polygon for 1 ring, MultiPolygon for 2+ rings.
|
||||
fn build_postcode_geometry(rings: &[Vec<[f32; 2]>]) -> Value {
|
||||
if rings.len() == 1 {
|
||||
let coords: Vec<Value> = rings[0]
|
||||
.iter()
|
||||
.map(|[lon, lat]| {
|
||||
Value::Array(vec![Value::from(*lon as f64), Value::from(*lat as f64)])
|
||||
})
|
||||
.collect();
|
||||
serde_json::json!({ "type": "Polygon", "coordinates": [coords] })
|
||||
} else {
|
||||
let polys: Vec<Value> = rings
|
||||
.iter()
|
||||
.map(|ring| {
|
||||
let coords: Vec<Value> = ring
|
||||
.iter()
|
||||
.map(|[lon, lat]| {
|
||||
Value::Array(vec![Value::from(*lon as f64), Value::from(*lat as f64)])
|
||||
})
|
||||
.collect();
|
||||
Value::Array(vec![Value::Array(coords)])
|
||||
})
|
||||
.collect();
|
||||
serde_json::json!({ "type": "MultiPolygon", "coordinates": polys })
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -96,12 +62,7 @@ pub async fn get_postcodes(
|
|||
state: Arc<AppState>,
|
||||
Query(params): Query<PostcodeParams>,
|
||||
) -> Result<Json<PostcodesResponse>, (StatusCode, String)> {
|
||||
let bounds_str = params.bounds.ok_or((
|
||||
StatusCode::BAD_REQUEST,
|
||||
"bounds parameter is required".into(),
|
||||
))?;
|
||||
|
||||
let (south, west, north, east) = parse_bounds(&bounds_str)?;
|
||||
let (south, west, north, east) = require_bounds(params.bounds)?;
|
||||
|
||||
let filters_str = params.filters.clone();
|
||||
let (parsed_filters, parsed_enum_filters) = parse_filters(
|
||||
|
|
@ -111,22 +72,7 @@ pub async fn get_postcodes(
|
|||
);
|
||||
let num_filters = parsed_filters.len() + parsed_enum_filters.len();
|
||||
|
||||
// Parse optional `fields` param into feature indices
|
||||
let field_indices: Option<Vec<usize>> = params.fields.as_ref().map(|fields_str| {
|
||||
if fields_str.is_empty() {
|
||||
return Vec::new();
|
||||
}
|
||||
fields_str
|
||||
.split(',')
|
||||
.filter_map(|name| {
|
||||
let name = name.trim();
|
||||
if name.is_empty() {
|
||||
return None;
|
||||
}
|
||||
state.feature_name_to_index.get(name).copied()
|
||||
})
|
||||
.collect()
|
||||
});
|
||||
let field_indices = parse_field_indices(params.fields.as_deref(), &state.feature_name_to_index);
|
||||
|
||||
let response = tokio::task::spawn_blocking(move || -> Result<PostcodesResponse, String> {
|
||||
let postcode_data = &state.postcode_data;
|
||||
|
|
@ -168,11 +114,11 @@ pub async fn get_postcodes(
|
|||
|
||||
// Aggregate for each postcode that has properties in bounds
|
||||
// (polygon intersection check happens later when building response)
|
||||
let mut postcode_aggs: FxHashMap<usize, PostcodeAgg> = FxHashMap::default();
|
||||
let mut postcode_aggs: FxHashMap<usize, Aggregator> = FxHashMap::default();
|
||||
for (&pc_idx, rows) in &postcode_rows {
|
||||
let agg = postcode_aggs
|
||||
.entry(pc_idx)
|
||||
.or_insert_with(|| PostcodeAgg::new(num_features));
|
||||
.or_insert_with(|| Aggregator::new(num_features));
|
||||
for &row in rows {
|
||||
if has_selective {
|
||||
agg.add_row_selective(feature_data, row, num_features, sel_indices);
|
||||
|
|
@ -222,42 +168,7 @@ pub async fn get_postcodes(
|
|||
continue;
|
||||
}
|
||||
|
||||
// Build GeoJSON geometry: Polygon (1 ring) or MultiPolygon (2+ rings)
|
||||
let geometry = if rings.len() == 1 {
|
||||
let coords: Vec<Value> = rings[0]
|
||||
.iter()
|
||||
.map(|[lon, lat]| {
|
||||
Value::Array(vec![Value::from(*lon as f64), Value::from(*lat as f64)])
|
||||
})
|
||||
.collect();
|
||||
let mut geo = Map::new();
|
||||
geo.insert("type".into(), Value::String("Polygon".into()));
|
||||
geo.insert(
|
||||
"coordinates".into(),
|
||||
Value::Array(vec![Value::Array(coords)]),
|
||||
);
|
||||
geo
|
||||
} else {
|
||||
let polys: Vec<Value> = rings
|
||||
.iter()
|
||||
.map(|ring| {
|
||||
let coords: Vec<Value> = ring
|
||||
.iter()
|
||||
.map(|[lon, lat]| {
|
||||
Value::Array(vec![
|
||||
Value::from(*lon as f64),
|
||||
Value::from(*lat as f64),
|
||||
])
|
||||
})
|
||||
.collect();
|
||||
Value::Array(vec![Value::Array(coords)])
|
||||
})
|
||||
.collect();
|
||||
let mut geo = Map::new();
|
||||
geo.insert("type".into(), Value::String("MultiPolygon".into()));
|
||||
geo.insert("coordinates".into(), Value::Array(polys));
|
||||
geo
|
||||
};
|
||||
let geometry = build_postcode_geometry(rings);
|
||||
|
||||
// Build properties
|
||||
let centroid = postcode_data.centroids[pc_idx];
|
||||
|
|
@ -300,7 +211,7 @@ pub async fn get_postcodes(
|
|||
// Build GeoJSON Feature
|
||||
let mut feature = Map::new();
|
||||
feature.insert("type".into(), Value::String("Feature".into()));
|
||||
feature.insert("geometry".into(), Value::Object(geometry));
|
||||
feature.insert("geometry".into(), geometry);
|
||||
feature.insert("properties".into(), Value::Object(props));
|
||||
|
||||
features.push(feature);
|
||||
|
|
@ -353,31 +264,7 @@ pub async fn get_postcode_lookup(
|
|||
if let Some(&idx) = postcode_data.postcode_to_idx.get(&normalized) {
|
||||
let (lat, lon) = postcode_data.centroids[idx];
|
||||
let rings = &postcode_data.polygons[idx];
|
||||
|
||||
// Build GeoJSON geometry
|
||||
let geometry = if rings.len() == 1 {
|
||||
let coords: Vec<Value> = rings[0]
|
||||
.iter()
|
||||
.map(|[lo, la]| {
|
||||
Value::Array(vec![Value::from(*lo as f64), Value::from(*la as f64)])
|
||||
})
|
||||
.collect();
|
||||
serde_json::json!({ "type": "Polygon", "coordinates": [coords] })
|
||||
} else {
|
||||
let polys: Vec<Value> = rings
|
||||
.iter()
|
||||
.map(|ring| {
|
||||
let coords: Vec<Value> = ring
|
||||
.iter()
|
||||
.map(|[lo, la]| {
|
||||
Value::Array(vec![Value::from(*lo as f64), Value::from(*la as f64)])
|
||||
})
|
||||
.collect();
|
||||
Value::Array(vec![Value::Array(coords)])
|
||||
})
|
||||
.collect();
|
||||
serde_json::json!({ "type": "MultiPolygon", "coordinates": polys })
|
||||
};
|
||||
let geometry = build_postcode_geometry(rings);
|
||||
|
||||
info!(postcode = %normalized, "GET /api/postcode/{postcode}");
|
||||
Ok(Json(serde_json::json!({
|
||||
|
|
|
|||
|
|
@ -8,11 +8,11 @@ use rustc_hash::FxHashMap;
|
|||
use serde::{Deserialize, Serialize};
|
||||
use tracing::{info, warn};
|
||||
|
||||
use crate::consts::{
|
||||
DEFAULT_PROPERTIES_LIMIT, H3_PRECOMPUTE_MAX, H3_REQUEST_MAX, H3_REQUEST_MIN,
|
||||
MAX_PROPERTIES_LIMIT,
|
||||
use crate::consts::{DEFAULT_PROPERTIES_LIMIT, MAX_PROPERTIES_LIMIT};
|
||||
use crate::parsing::{
|
||||
cell_for_row, h3_cell_bounds, needs_parent, parse_filters, row_passes_filters,
|
||||
validate_h3_resolution,
|
||||
};
|
||||
use crate::parsing::{h3_cell_bounds, parse_filters, row_passes_filters};
|
||||
use crate::state::AppState;
|
||||
|
||||
#[derive(Deserialize)]
|
||||
|
|
@ -103,19 +103,8 @@ pub async fn get_hexagon_properties(
|
|||
let cell_u64: u64 = cell.into();
|
||||
|
||||
let resolution = params.resolution;
|
||||
if !(H3_REQUEST_MIN..=H3_REQUEST_MAX).contains(&resolution) {
|
||||
warn!(
|
||||
resolution,
|
||||
"Resolution out of range [{}, {}]", H3_REQUEST_MIN, H3_REQUEST_MAX
|
||||
);
|
||||
return Err((
|
||||
StatusCode::BAD_REQUEST,
|
||||
format!(
|
||||
"resolution must be between {} and {}",
|
||||
H3_REQUEST_MIN, H3_REQUEST_MAX
|
||||
),
|
||||
));
|
||||
}
|
||||
validate_h3_resolution(resolution)?;
|
||||
|
||||
let h3_str = params.h3.clone();
|
||||
let filters_str = params.filters.clone();
|
||||
let (parsed_filters, parsed_enum_filters) = parse_filters(
|
||||
|
|
@ -130,7 +119,7 @@ pub async fn get_hexagon_properties(
|
|||
let precomputed = &state.h3_cells;
|
||||
let h3_res = h3o::Resolution::try_from(resolution)
|
||||
.map_err(|err| format!("Invalid H3 resolution {}: {}", resolution, err))?;
|
||||
let need_parent = resolution < H3_PRECOMPUTE_MAX;
|
||||
let need_parent = needs_parent(resolution);
|
||||
let num_features = state.data.num_features;
|
||||
let feature_data = &state.data.feature_data;
|
||||
let feature_names = &state.data.feature_names;
|
||||
|
|
@ -139,24 +128,12 @@ pub async fn get_hexagon_properties(
|
|||
|
||||
let (min_lat, min_lon, max_lat, max_lon) = h3_cell_bounds(cell, 0.001);
|
||||
|
||||
let cell_for_row = |row: usize| -> u64 {
|
||||
let max_cell = precomputed[row];
|
||||
if !need_parent || max_cell == 0 {
|
||||
return max_cell;
|
||||
}
|
||||
h3o::CellIndex::try_from(max_cell)
|
||||
.ok()
|
||||
.and_then(|ci| ci.parent(h3_res))
|
||||
.map(u64::from)
|
||||
.unwrap_or(0)
|
||||
};
|
||||
|
||||
let mut matching_rows: Vec<usize> = Vec::new();
|
||||
state
|
||||
.grid
|
||||
.for_each_in_bounds(min_lat, min_lon, max_lat, max_lon, |row_idx| {
|
||||
let row = row_idx as usize;
|
||||
if cell_for_row(row) == cell_u64
|
||||
if cell_for_row(row, precomputed, h3_res, need_parent) == cell_u64
|
||||
&& row_passes_filters(
|
||||
row,
|
||||
&parsed_filters,
|
||||
|
|
|
|||
163
server-rs/src/routes/stats.rs
Normal file
163
server-rs/src/routes/stats.rs
Normal file
|
|
@ -0,0 +1,163 @@
|
|||
use std::collections::{HashMap, HashSet};
|
||||
|
||||
use rustc_hash::FxHashMap;
|
||||
|
||||
use crate::consts::MAX_PRICE_HISTORY_POINTS;
|
||||
use crate::data::FeatureStats;
|
||||
|
||||
use super::hexagon_stats::{EnumFeatureStats, HistogramStats, NumericFeatureStats, PricePoint};
|
||||
|
||||
/// Extract price history (year, price) pairs from matching rows, downsampled if needed.
|
||||
pub fn extract_price_history(
|
||||
matching_rows: &[usize],
|
||||
feature_data: &[f32],
|
||||
num_features: usize,
|
||||
feature_name_to_index: &FxHashMap<String, usize>,
|
||||
) -> Vec<PricePoint> {
|
||||
let year_idx = feature_name_to_index
|
||||
.get("Date of last transaction")
|
||||
.copied();
|
||||
let price_idx = feature_name_to_index.get("Last known price").copied();
|
||||
match (year_idx, price_idx) {
|
||||
(Some(yi), Some(pi)) => {
|
||||
let mut points: Vec<PricePoint> = matching_rows
|
||||
.iter()
|
||||
.filter_map(|&row| {
|
||||
let year = feature_data[row * num_features + yi];
|
||||
let price = feature_data[row * num_features + pi];
|
||||
if year.is_finite() && price.is_finite() {
|
||||
Some(PricePoint { year, price })
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
if points.len() > MAX_PRICE_HISTORY_POINTS {
|
||||
let step = points.len() as f64 / MAX_PRICE_HISTORY_POINTS as f64;
|
||||
points = (0..MAX_PRICE_HISTORY_POINTS)
|
||||
.map(|i| {
|
||||
let idx = (i as f64 * step) as usize;
|
||||
PricePoint {
|
||||
year: points[idx].year,
|
||||
price: points[idx].price,
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
}
|
||||
points
|
||||
}
|
||||
_ => Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Compute per-feature stats (numeric histograms + enum counts) for the given rows.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn compute_feature_stats(
|
||||
matching_rows: &[usize],
|
||||
feature_data: &[f32],
|
||||
feature_names: &[String],
|
||||
num_features: usize,
|
||||
enum_values: &FxHashMap<usize, Vec<String>>,
|
||||
feature_stats_data: &[FeatureStats],
|
||||
fields_specified: bool,
|
||||
field_set: &HashSet<String>,
|
||||
) -> (Vec<NumericFeatureStats>, Vec<EnumFeatureStats>) {
|
||||
let mut numeric_features = Vec::new();
|
||||
let mut enum_features_out = Vec::new();
|
||||
|
||||
for (feature_index, feature_name) in feature_names.iter().enumerate() {
|
||||
if fields_specified && !field_set.contains(feature_name.as_str()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if let Some(ev) = enum_values.get(&feature_index) {
|
||||
let mut value_counts = vec![0u64; ev.len()];
|
||||
for &row in matching_rows {
|
||||
let value = feature_data[row * num_features + feature_index];
|
||||
if value.is_finite() {
|
||||
let idx = value as usize;
|
||||
if idx < value_counts.len() {
|
||||
value_counts[idx] += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let counts: HashMap<String, u64> = value_counts
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter(|(_, &count)| count > 0)
|
||||
.map(|(idx, &count)| (ev[idx].clone(), count))
|
||||
.collect();
|
||||
|
||||
if !counts.is_empty() {
|
||||
enum_features_out.push(EnumFeatureStats {
|
||||
name: feature_name.clone(),
|
||||
counts,
|
||||
});
|
||||
}
|
||||
} else {
|
||||
let global_hist = &feature_stats_data[feature_index].histogram;
|
||||
let p1 = global_hist.p1;
|
||||
let p99 = global_hist.p99;
|
||||
let num_bins = global_hist.counts.len();
|
||||
|
||||
let mut count = 0usize;
|
||||
let mut min_value = f32::INFINITY;
|
||||
let mut max_value = f32::NEG_INFINITY;
|
||||
let mut sum = 0.0f64;
|
||||
let mut bins = vec![0u64; num_bins];
|
||||
|
||||
let middle_bins = num_bins.saturating_sub(2);
|
||||
let middle_width = if middle_bins > 0 && p99 > p1 {
|
||||
(p99 - p1) / middle_bins as f32
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
|
||||
for &row in matching_rows {
|
||||
let value = feature_data[row * num_features + feature_index];
|
||||
if value.is_finite() {
|
||||
count += 1;
|
||||
if value < min_value {
|
||||
min_value = value;
|
||||
}
|
||||
if value > max_value {
|
||||
max_value = value;
|
||||
}
|
||||
sum += value as f64;
|
||||
|
||||
let bin = if value < p1 {
|
||||
0
|
||||
} else if value >= p99 {
|
||||
num_bins - 1
|
||||
} else if middle_width > 0.0 {
|
||||
let middle_bin = ((value - p1) / middle_width) as usize;
|
||||
(1 + middle_bin).min(num_bins - 2)
|
||||
} else {
|
||||
num_bins / 2
|
||||
};
|
||||
bins[bin] += 1;
|
||||
}
|
||||
}
|
||||
|
||||
if count > 0 {
|
||||
numeric_features.push(NumericFeatureStats {
|
||||
name: feature_name.clone(),
|
||||
count,
|
||||
min: min_value as f64,
|
||||
max: max_value as f64,
|
||||
mean: sum / count as f64,
|
||||
histogram: HistogramStats {
|
||||
min: global_hist.min as f64,
|
||||
max: global_hist.max as f64,
|
||||
p1: p1 as f64,
|
||||
p99: p99 as f64,
|
||||
counts: bins,
|
||||
},
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
(numeric_features, enum_features_out)
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue