perfect-postcode/server-rs/src/routes/stats.rs

443 lines
16 KiB
Rust

use std::collections::{HashMap, HashSet};
use metrics::counter;
use rustc_hash::FxHashMap;
use tracing::error;
use crate::consts::PRICE_HISTORY_POINTS_LIMIT;
use crate::data::crime_by_year::CrimeByYearData;
use crate::data::{FeatureStats, PostcodePoiMetrics, PropertyData};
use super::hexagon_stats::{
CrimeYearPoint, CrimeYearStats, EnumFeatureStats, HistogramStats, NumericFeatureStats,
PricePoint,
};
/// Build the price history for the selected rows as a list of [`PricePoint`]s,
/// thinned to at most `PRICE_HISTORY_POINTS_LIMIT` entries.
///
/// The `year` of each point is the row's "Date of last transaction" feature and
/// the `price` is the row's last known raw price. Rows where either value is not
/// finite are dropped. If the "Date of last transaction" feature is not present
/// in `feature_name_to_index`, the history is empty.
pub fn extract_price_history(
    matching_rows: &[usize],
    data: &PropertyData,
    feature_name_to_index: &FxHashMap<String, usize>,
) -> Vec<PricePoint> {
    let year_feature = match feature_name_to_index.get("Date of last transaction") {
        Some(&index) => index,
        None => return Vec::new(),
    };

    // Keep only rows that have both a usable year and a usable price.
    let mut history = Vec::new();
    for &row in matching_rows {
        let year = data.get_feature(row, year_feature);
        let price = data.last_known_price_raw(row);
        if !year.is_finite() || !price.is_finite() {
            continue;
        }
        history.push(PricePoint { year, price });
    }

    let total = history.len();
    if total <= PRICE_HISTORY_POINTS_LIMIT {
        return history;
    }

    // Too many points: take evenly spaced samples. Slot `i` reads the point at
    // `floor(i * total / limit)`, which always stays below `total`.
    let stride = total as f64 / PRICE_HISTORY_POINTS_LIMIT as f64;
    let mut sampled = Vec::with_capacity(PRICE_HISTORY_POINTS_LIMIT);
    for slot in 0..PRICE_HISTORY_POINTS_LIMIT {
        let source = &history[(slot as f64 * stride) as usize];
        sampled.push(PricePoint {
            year: source.year,
            price: source.price,
        });
    }
    sampled
}
/// Per-feature accumulator kind, determined once before the row loop.
///
/// `compute_feature_stats` builds one of these per feature index, updates it
/// for every matching row, and finally converts it into a response struct.
enum FeatureAccum {
/// Numeric: track count, min, max, sum, histogram bins.
Numeric {
// Number of finite values seen so far.
count: usize,
// Running minimum of the finite values; starts at `f32::INFINITY`.
min_value: f32,
// Running maximum of the finite values; starts at `f32::NEG_INFINITY`.
max_value: f32,
// Sum of the finite values, kept in f64; divided by `count` for the mean.
sum: f64,
// Per-bin counts for the selection; allocated with `num_bins` entries.
bins: Vec<u64>,
// `p1` of the feature's global histogram: values below it go to bin 0.
p1: f32,
// `p99` of the feature's global histogram: values at or above it go to
// the last bin.
p99: f32,
// Width of each bin between `p1` and `p99`; 0.0 when there are no middle
// bins or `p99 <= p1`.
middle_width: f32,
// Number of bins in the feature's global histogram.
num_bins: usize,
// `min` of the global histogram, copied unchanged into the response.
global_min: f32,
// `max` of the global histogram, copied unchanged into the response.
global_max: f32,
},
/// Enum: count occurrences per variant index.
/// `value_counts` has one slot per variant label known for the feature.
Enum { value_counts: Vec<u64> },
/// Feature skipped (not in field_set).
Skip,
}
/// Compute per-feature stats (numeric histograms + enum counts) for the given rows.
/// Single-pass: iterates rows in the outer loop for cache-friendly row-major access.
#[allow(clippy::too_many_arguments)]
pub fn compute_feature_stats(
matching_rows: &[usize],
data: &PropertyData,
feature_names: &[String],
enum_values: &FxHashMap<usize, Vec<String>>,
feature_stats_data: &[FeatureStats],
fields_specified: bool,
field_set: &HashSet<String>,
) -> (Vec<NumericFeatureStats>, Vec<EnumFeatureStats>) {
let num_features = feature_names.len();
// Pre-allocate accumulators for all features
let mut accums: Vec<FeatureAccum> = (0..num_features)
.map(|fi| {
let feature_name = &feature_names[fi];
if fields_specified && !field_set.contains(feature_name.as_str()) {
return FeatureAccum::Skip;
}
if let Some(ev) = enum_values.get(&fi) {
FeatureAccum::Enum {
value_counts: vec![0u64; ev.len()],
}
} else {
let global_hist = &feature_stats_data[fi].histogram;
let p1 = global_hist.p1;
let p99 = global_hist.p99;
let num_bins = global_hist.counts.len();
let middle_bins = num_bins.saturating_sub(2);
let middle_width = if middle_bins > 0 && p99 > p1 {
(p99 - p1) / middle_bins as f32
} else {
0.0
};
FeatureAccum::Numeric {
count: 0,
min_value: f32::INFINITY,
max_value: f32::NEG_INFINITY,
sum: 0.0,
bins: vec![0u64; num_bins],
p1,
p99,
middle_width,
num_bins,
global_min: global_hist.min,
global_max: global_hist.max,
}
}
})
.collect();
// Single pass: outer loop = rows, inner loop = features (cache-friendly row-major access)
for &row in matching_rows {
for (fi, accum) in accums.iter_mut().enumerate() {
match accum {
FeatureAccum::Skip => {}
FeatureAccum::Enum { value_counts } => {
let value = data.get_feature(row, fi);
if value.is_finite() {
// Reject negatives, NaN-via-large-cast, and any out-of-range
// index. A schema/data mismatch is a critical data-integrity
// bug — skip the row, count it, and surface as error so
// monitoring catches it.
let len = value_counts.len();
let idx_ok = value >= 0.0 && (value as usize) < len;
if idx_ok {
value_counts[value as usize] += 1;
} else {
counter!("stats_enum_oob_total").increment(1);
error!(
feature = feature_names[fi].as_str(),
value,
max = len,
"Enum index out of bounds — data/schema mismatch"
);
}
}
}
FeatureAccum::Numeric {
count,
min_value,
max_value,
sum,
bins,
p1,
p99,
middle_width,
num_bins,
..
} => {
let value = data.get_feature(row, fi);
if value.is_finite() {
*count += 1;
if value < *min_value {
*min_value = value;
}
if value > *max_value {
*max_value = value;
}
*sum += value as f64;
let bin = if value < *p1 {
0
} else if value >= *p99 {
*num_bins - 1
} else if *middle_width > 0.0 {
let middle_bin = ((value - *p1) / *middle_width) as usize;
(1 + middle_bin).min(*num_bins - 2)
} else {
*num_bins / 2
};
bins[bin] += 1;
}
}
}
}
}
// Build response structs from accumulators
let mut numeric_features = Vec::new();
let mut enum_features_out = Vec::new();
for (fi, accum) in accums.into_iter().enumerate() {
match accum {
FeatureAccum::Skip => {}
FeatureAccum::Enum { value_counts } => {
let ev = &enum_values[&fi];
let counts: HashMap<String, u64> = value_counts
.iter()
.enumerate()
.filter(|(_, &count)| count > 0)
.map(|(idx, &count)| (ev[idx].clone(), count))
.collect();
if !counts.is_empty() {
enum_features_out.push(EnumFeatureStats {
name: feature_names[fi].clone(),
counts,
});
}
}
FeatureAccum::Numeric {
count,
min_value,
max_value,
sum,
bins,
p1,
p99,
global_min,
global_max,
..
} => {
if count > 0 {
numeric_features.push(NumericFeatureStats {
name: feature_names[fi].clone(),
count,
min: min_value as f64,
max: max_value as f64,
mean: sum / count as f64,
histogram: HistogramStats {
min: global_min as f64,
max: global_max as f64,
p1: p1 as f64,
p99: p99 as f64,
counts: bins,
},
});
}
}
}
}
(numeric_features, enum_features_out)
}
/// Property-weighted mean crime count per year, one series per crime type.
///
/// Every matching property contributes the per-year counts of its postcode
/// (incidents near that postcode), so a postcode with many selected properties
/// weighs proportionally more. This is the same property-weighted-average shape
/// used elsewhere in the right pane.
///
/// The per-year denominators are coverage-aware. police.uk has multi-year
/// publication gaps for whole forces (e.g. Greater Manchester from 2019-07),
/// and the pipeline emits a `covered_years` calendar per postcode:
///
/// * a row whose postcode has a calendar counts toward a year only if that
///   year is in the calendar — only then is a missing bar a genuine zero;
/// * a row whose postcode has no calendar (legacy parquet without the column)
///   counts toward every year, which restores the previous behaviour;
/// * a year that no selected row covers is left out of the series entirely
///   (charted as a gap, not a zero).
///
/// When `fields_specified` is true, a crime type is only reported if
/// `field_set` contains its bare name or its name with the " (avg/yr)" suffix.
/// Crime types that end up with no points are omitted.
pub fn compute_crime_by_year(
    matching_rows: &[usize],
    data: &PropertyData,
    crime_by_year: &CrimeByYearData,
    fields_specified: bool,
    field_set: &HashSet<String>,
) -> Vec<CrimeYearStats> {
    let type_count = crime_by_year.crime_types.len();
    if type_count == 0 || matching_rows.is_empty() {
        return Vec::new();
    }

    // One year -> summed-count map per crime type.
    let mut sums_by_type: Vec<FxHashMap<i32, f64>> = Vec::with_capacity(type_count);
    for _ in 0..type_count {
        sums_by_type.push(FxHashMap::default());
    }

    // Denominator parts: rows whose calendar lists the year, and rows that
    // have no calendar at all (legacy data, treated as covered everywhere).
    let mut rows_covering_year: FxHashMap<i32, u32> = FxHashMap::default();
    let mut rows_without_calendar: u32 = 0;

    for &row in matching_rows {
        let postcode = data.postcode(row);

        if let Some(calendar) = crime_by_year.covered_years_by_postcode.get(postcode) {
            // An empty calendar (force gap for the whole window / unusable
            // boundary geometry) contributes to no year: the postcode's crime
            // picture is unknown and must not dilute any year's mean.
            for &year in calendar {
                *rows_covering_year.entry(year).or_default() += 1;
            }
        } else {
            rows_without_calendar += 1;
        }

        // No series for a type means no recorded incidents of that type: the
        // row adds nothing to the sums while its covered years still count in
        // the denominator (a genuine zero). Uncovered years are handled by the
        // denominators, not here.
        let series_list = match crime_by_year.series_by_postcode.get(postcode) {
            Some(list) => list,
            None => continue,
        };
        for series in series_list {
            let year_sums = &mut sums_by_type[series.type_idx as usize];
            for point in &series.points {
                *year_sums.entry(point.year).or_default() += point.count as f64;
            }
        }
    }

    let mut stats = Vec::new();
    for (type_idx, name) in crime_by_year.crime_types.iter().enumerate() {
        // The by-year side table uses bare type names (e.g. "Burglary") while
        // configured feature names carry an " (avg/yr)" suffix; accept both so
        // callers can pass the feature names they already have.
        if fields_specified {
            let suffixed = format!("{name} (avg/yr)");
            let requested =
                field_set.contains(name.as_str()) || field_set.contains(suffixed.as_str());
            if !requested {
                continue;
            }
        }

        let years = match crime_by_year.years_by_type.get(type_idx) {
            Some(years) if !years.is_empty() => years,
            _ => continue,
        };

        let year_sums = &sums_by_type[type_idx];
        let mut points = Vec::new();
        for &year in years {
            let denom = rows_without_calendar
                + rows_covering_year.get(&year).copied().unwrap_or(0);
            if denom == 0 {
                // No selected postcode has published data for this year.
                continue;
            }
            let total = year_sums.get(&year).copied().unwrap_or(0.0);
            points.push(CrimeYearPoint {
                year,
                count: (total / denom as f64) as f32,
            });
        }

        if !points.is_empty() {
            stats.push(CrimeYearStats {
                name: name.clone(),
                points,
            });
        }
    }
    stats
}
/// Compute numeric stats (count, min, max, mean, histogram) for each POI metric
/// over the given rows.
///
/// # Arguments
/// * `matching_rows` - property row indices of the selection.
/// * `poi_metrics` - POI metric table: `feature_names` and `feature_stats` are
///   indexed by metric, and values are read with
///   `get_for_property_row(row, metric)`.
/// * `fields_specified` / `field_set` - when `fields_specified` is true, only
///   metrics whose name is in `field_set` are computed.
///
/// # Returns
/// One entry per computed metric that has at least one finite value in the
/// selection; non-finite values are ignored.
///
/// # Histogram layout
/// Bins follow the metric's global histogram: bin 0 holds values below `p1`,
/// the last bin holds values at or above `p99`, and the bins in between split
/// `[p1, p99)` evenly. When there is no usable middle range (fewer than three
/// bins, or `p99 <= p1`), in-range values go to bin `num_bins / 2`. If the
/// global histogram has no bins at all, nothing is binned and the returned
/// `counts` is empty; count/min/max/mean are still reported.
///
/// # Panics
/// Panics if `poi_metrics.feature_stats` has no entry for a computed metric.
pub fn compute_poi_feature_stats(
    matching_rows: &[usize],
    poi_metrics: &PostcodePoiMetrics,
    fields_specified: bool,
    field_set: &HashSet<String>,
) -> Vec<NumericFeatureStats> {
    let mut out = Vec::new();
    for (metric_idx, name) in poi_metrics.feature_names.iter().enumerate() {
        if fields_specified && !field_set.contains(name.as_str()) {
            continue;
        }
        let global_hist = &poi_metrics.feature_stats[metric_idx].histogram;
        let p1 = global_hist.p1;
        let p99 = global_hist.p99;
        let num_bins = global_hist.counts.len();
        // The first and last bins are reserved for the tails.
        let middle_bins = num_bins.saturating_sub(2);
        let middle_width = if middle_bins > 0 && p99 > p1 {
            (p99 - p1) / middle_bins as f32
        } else {
            0.0
        };
        let mut count = 0usize;
        let mut min_value = f32::INFINITY;
        let mut max_value = f32::NEG_INFINITY;
        let mut sum = 0.0f64;
        let mut bins = vec![0u64; num_bins];
        for &row in matching_rows {
            let value = poi_metrics.get_for_property_row(row, metric_idx);
            if !value.is_finite() {
                continue;
            }
            count += 1;
            if value < min_value {
                min_value = value;
            }
            if value > max_value {
                max_value = value;
            }
            sum += value as f64;
            // A global histogram with zero bins gives us nowhere to put the
            // value: `num_bins - 1` would underflow and `bins[..]` would index
            // an empty vector. Keep the scalar stats and leave the histogram
            // empty instead of panicking.
            if num_bins == 0 {
                continue;
            }
            let bin = if value < p1 {
                0
            } else if value >= p99 {
                num_bins - 1
            } else if middle_width > 0.0 {
                // `middle_width > 0` implies at least three bins, so
                // `num_bins - 2` cannot underflow here.
                let middle_bin = ((value - p1) / middle_width) as usize;
                (1 + middle_bin).min(num_bins - 2)
            } else {
                num_bins / 2
            };
            bins[bin] += 1;
        }
        if count > 0 {
            out.push(NumericFeatureStats {
                name: name.clone(),
                count,
                min: min_value as f64,
                max: max_value as f64,
                mean: sum / count as f64,
                histogram: HistogramStats {
                    min: global_hist.min as f64,
                    max: global_hist.max as f64,
                    p1: p1 as f64,
                    p99: p99 as f64,
                    counts: bins,
                },
            });
        }
    }
    out
}