use std::collections::{HashMap, HashSet};

use metrics::counter;
use rustc_hash::FxHashMap;
use tracing::error;

use crate::consts::PRICE_HISTORY_POINTS_LIMIT;
use crate::data::crime_by_year::CrimeByYearData;
use crate::data::{FeatureStats, PostcodePoiMetrics, PropertyData};

use super::hexagon_stats::{
    CrimeYearPoint, CrimeYearStats, EnumFeatureStats, HistogramStats, NumericFeatureStats,
    PricePoint,
};

/// Extract price history (year, price) pairs from matching rows, downsampled if needed.
///
/// For every row in `matching_rows` the value of the `"Date of last transaction"`
/// feature is paired with `data.last_known_price_raw(row)`; rows where either
/// value is not finite are dropped. When more than
/// [`PRICE_HISTORY_POINTS_LIMIT`] points survive, exactly that many are kept by
/// sampling the collected points at an even stride (input order is preserved).
///
/// Returns an empty vector when `feature_name_to_index` has no entry for
/// `"Date of last transaction"`.
// NOTE(review): the `String` key type is inferred from the `&str` lookup below —
// confirm against the caller.
pub fn extract_price_history(
    matching_rows: &[usize],
    data: &PropertyData,
    feature_name_to_index: &FxHashMap<String, usize>,
) -> Vec<PricePoint> {
    // Without the transaction-date column there is nothing to pair prices with.
    let yi = match feature_name_to_index
        .get("Date of last transaction")
        .copied()
    {
        Some(yi) => yi,
        None => return Vec::new(),
    };

    let points: Vec<PricePoint> = matching_rows
        .iter()
        .filter_map(|&row| {
            let year = data.get_feature(row, yi);
            let price = data.last_known_price_raw(row);
            if year.is_finite() && price.is_finite() {
                Some(PricePoint { year, price })
            } else {
                None
            }
        })
        .collect();

    if points.len() <= PRICE_HISTORY_POINTS_LIMIT {
        return points;
    }

    // Fractional stride (> 1 here): sample `i` reads index floor(i * step),
    // which stays below `points.len()` for every `i < PRICE_HISTORY_POINTS_LIMIT`.
    let step = points.len() as f64 / PRICE_HISTORY_POINTS_LIMIT as f64;
    (0..PRICE_HISTORY_POINTS_LIMIT)
        .map(|i| {
            let idx = (i as f64 * step) as usize;
            // Rebuilt field by field so `PricePoint` is not required to be `Clone`.
            PricePoint {
                year: points[idx].year,
                price: points[idx].price,
            }
        })
        .collect()
}

/// Per-feature accumulator kind, determined once before the row loop.
enum FeatureAccum {
    /// Numeric: track count, min, max, sum, histogram bins.
    Numeric {
        /// Number of finite values seen so far.
        count: usize,
        /// Smallest finite value seen (starts at `+inf`).
        min_value: f32,
        /// Largest finite value seen (starts at `-inf`).
        max_value: f32,
        /// Running sum of finite values, kept in `f64`.
        sum: f64,
        /// Per-bin counts; same length as the feature's global histogram.
        bins: Vec<u64>,
        /// `p1` of the feature's global histogram; values below it go to bin 0.
        p1: f32,
        /// `p99` of the feature's global histogram; values at or above it go
        /// to the last bin.
        p99: f32,
        /// Width of each bin between `p1` and `p99`; `0.0` when there are no
        /// middle bins or `p99 <= p1`.
        middle_width: f32,
        /// Number of bins in the global histogram.
        num_bins: usize,
        /// `min` of the global histogram, passed through to the response.
        global_min: f32,
        /// `max` of the global histogram, passed through to the response.
        global_max: f32,
    },
    /// Enum: count occurrences per variant index.
    Enum { value_counts: Vec<u64> },
    /// Feature skipped (not in field_set).
    Skip,
}

/// Compute per-feature stats (numeric histograms + enum counts) for the given rows.
/// Single-pass: iterates rows in the outer loop for cache-friendly row-major access.
#[allow(clippy::too_many_arguments)] pub fn compute_feature_stats( matching_rows: &[usize], data: &PropertyData, feature_names: &[String], enum_values: &FxHashMap>, feature_stats_data: &[FeatureStats], fields_specified: bool, field_set: &HashSet, ) -> (Vec, Vec) { let num_features = feature_names.len(); // Pre-allocate accumulators for all features let mut accums: Vec = (0..num_features) .map(|fi| { let feature_name = &feature_names[fi]; if fields_specified && !field_set.contains(feature_name.as_str()) { return FeatureAccum::Skip; } if let Some(ev) = enum_values.get(&fi) { FeatureAccum::Enum { value_counts: vec![0u64; ev.len()], } } else { let global_hist = &feature_stats_data[fi].histogram; let p1 = global_hist.p1; let p99 = global_hist.p99; let num_bins = global_hist.counts.len(); let middle_bins = num_bins.saturating_sub(2); let middle_width = if middle_bins > 0 && p99 > p1 { (p99 - p1) / middle_bins as f32 } else { 0.0 }; FeatureAccum::Numeric { count: 0, min_value: f32::INFINITY, max_value: f32::NEG_INFINITY, sum: 0.0, bins: vec![0u64; num_bins], p1, p99, middle_width, num_bins, global_min: global_hist.min, global_max: global_hist.max, } } }) .collect(); // Single pass: outer loop = rows, inner loop = features (cache-friendly row-major access) for &row in matching_rows { for (fi, accum) in accums.iter_mut().enumerate() { match accum { FeatureAccum::Skip => {} FeatureAccum::Enum { value_counts } => { let value = data.get_feature(row, fi); if value.is_finite() { // Reject negatives, NaN-via-large-cast, and any out-of-range // index. A schema/data mismatch is a critical data-integrity // bug — skip the row, count it, and surface as error so // monitoring catches it. 
let len = value_counts.len(); let idx_ok = value >= 0.0 && (value as usize) < len; if idx_ok { value_counts[value as usize] += 1; } else { counter!("stats_enum_oob_total").increment(1); error!( feature = feature_names[fi].as_str(), value, max = len, "Enum index out of bounds — data/schema mismatch" ); } } } FeatureAccum::Numeric { count, min_value, max_value, sum, bins, p1, p99, middle_width, num_bins, .. } => { let value = data.get_feature(row, fi); if value.is_finite() { *count += 1; if value < *min_value { *min_value = value; } if value > *max_value { *max_value = value; } *sum += value as f64; let bin = if value < *p1 { 0 } else if value >= *p99 { *num_bins - 1 } else if *middle_width > 0.0 { let middle_bin = ((value - *p1) / *middle_width) as usize; (1 + middle_bin).min(*num_bins - 2) } else { *num_bins / 2 }; bins[bin] += 1; } } } } } // Build response structs from accumulators let mut numeric_features = Vec::new(); let mut enum_features_out = Vec::new(); for (fi, accum) in accums.into_iter().enumerate() { match accum { FeatureAccum::Skip => {} FeatureAccum::Enum { value_counts } => { let ev = &enum_values[&fi]; let counts: HashMap = value_counts .iter() .enumerate() .filter(|(_, &count)| count > 0) .map(|(idx, &count)| (ev[idx].clone(), count)) .collect(); if !counts.is_empty() { enum_features_out.push(EnumFeatureStats { name: feature_names[fi].clone(), counts, }); } } FeatureAccum::Numeric { count, min_value, max_value, sum, bins, p1, p99, global_min, global_max, .. } => { if count > 0 { numeric_features.push(NumericFeatureStats { name: feature_names[fi].clone(), count, min: min_value as f64, max: max_value as f64, mean: sum / count as f64, histogram: HistogramStats { min: global_min as f64, max: global_max as f64, p1: p1 as f64, p99: p99 as f64, counts: bins, }, }); } } } } (numeric_features, enum_features_out) } /// Compute property-weighted per-year crime means across the selection. 
/// /// Each matching property contributes its postcode's per-year counts (incidents /// near that postcode); this is the same property-weighted-average shape used /// elsewhere in the right pane. /// /// Denominators are COVERAGE-AWARE: police.uk has multi-year publication gaps /// for whole forces (e.g. Greater Manchester from 2019-07), and the pipeline /// emits a `covered_years` calendar per postcode. A postcode only counts toward /// a year's denominator if its force published that year — and only then does /// its missing bar mean a genuine zero. Years no selected postcode covers are /// omitted entirely (charted as gaps, not zeros). Postcodes without coverage /// info (legacy parquet without the column) count toward every year, restoring /// the previous behaviour. pub fn compute_crime_by_year( matching_rows: &[usize], data: &PropertyData, crime_by_year: &CrimeByYearData, fields_specified: bool, field_set: &HashSet, ) -> Vec { if crime_by_year.crime_types.is_empty() || matching_rows.is_empty() { return Vec::new(); } let num_types = crime_by_year.crime_types.len(); let mut per_type_year_sums: Vec> = (0..num_types).map(|_| FxHashMap::default()).collect(); // Per-year denominator parts: rows whose coverage calendar includes the // year, plus rows with no calendar at all (legacy: covered everywhere). let mut covered_counts: FxHashMap = FxHashMap::default(); let mut fully_covered_rows: u32 = 0; for &row in matching_rows { let postcode = data.postcode(row); match crime_by_year.covered_years_by_postcode.get(postcode) { Some(years) => { // An empty list (force gap for the whole window / unusable // boundary geometry) adds nothing: the postcode's crime // picture is unknown and must not dilute any year's mean. 
for &year in years { *covered_counts.entry(year).or_insert(0) += 1; } } None => fully_covered_rows += 1, } // A postcode with a row but no series for a given type had no recorded // incidents of that type: it contributes 0 to the sums, and its covered // years still count in the denominator — a genuine zero. Uncovered // years are excluded via the denominators instead. if let Some(series_list) = crime_by_year.series_by_postcode.get(postcode) { for series in series_list { let acc = &mut per_type_year_sums[series.type_idx as usize]; for point in &series.points { *acc.entry(point.year).or_insert(0.0) += point.count as f64; } } } } let mut out = Vec::new(); for (type_idx, name) in crime_by_year.crime_types.iter().enumerate() { // Crime types in the by-year side table are bare (e.g. "Burglary"), while // the configured feature names carry an " (avg/yr)" suffix. Match either // form so callers can pass the feature names they already know. if fields_specified { let with_suffix = format!("{name} (avg/yr)"); if !field_set.contains(name.as_str()) && !field_set.contains(with_suffix.as_str()) { continue; } } let years = crime_by_year .years_by_type .get(type_idx) .map(Vec::as_slice) .unwrap_or(&[]); if years.is_empty() { continue; } let sums = &per_type_year_sums[type_idx]; let points: Vec = years .iter() .filter_map(|&year| { let denom = fully_covered_rows + covered_counts.get(&year).copied().unwrap_or(0); if denom == 0 { // No selected postcode has published data for this year. 
return None; } Some(CrimeYearPoint { year, count: (sums.get(&year).copied().unwrap_or(0.0) / denom as f64) as f32, }) }) .collect(); if points.is_empty() { continue; } out.push(CrimeYearStats { name: name.clone(), points, }); } out } pub fn compute_poi_feature_stats( matching_rows: &[usize], poi_metrics: &PostcodePoiMetrics, fields_specified: bool, field_set: &HashSet, ) -> Vec { let mut out = Vec::new(); for (metric_idx, name) in poi_metrics.feature_names.iter().enumerate() { if fields_specified && !field_set.contains(name.as_str()) { continue; } let global_hist = &poi_metrics.feature_stats[metric_idx].histogram; let p1 = global_hist.p1; let p99 = global_hist.p99; let num_bins = global_hist.counts.len(); let middle_bins = num_bins.saturating_sub(2); let middle_width = if middle_bins > 0 && p99 > p1 { (p99 - p1) / middle_bins as f32 } else { 0.0 }; let mut count = 0usize; let mut min_value = f32::INFINITY; let mut max_value = f32::NEG_INFINITY; let mut sum = 0.0f64; let mut bins = vec![0u64; num_bins]; for &row in matching_rows { let value = poi_metrics.get_for_property_row(row, metric_idx); if !value.is_finite() { continue; } count += 1; if value < min_value { min_value = value; } if value > max_value { max_value = value; } sum += value as f64; let bin = if value < p1 { 0 } else if value >= p99 { num_bins - 1 } else if middle_width > 0.0 { let middle_bin = ((value - p1) / middle_width) as usize; (1 + middle_bin).min(num_bins - 2) } else { num_bins / 2 }; bins[bin] += 1; } if count > 0 { out.push(NumericFeatureStats { name: name.clone(), count, min: min_value as f64, max: max_value as f64, mean: sum / count as f64, histogram: HistogramStats { min: global_hist.min as f64, max: global_hist.max as f64, p1: p1 as f64, p99: p99 as f64, counts: bins, }, }); } } out }