//! Feature statistics: outlier-bracketed histograms, percentile estimation and //! slider-bound computation. use anyhow::Context; use polars::prelude::*; use serde::Serialize; use crate::consts::HISTOGRAM_BINS; use crate::features::Bounds; /// Histogram with outlier buckets at the edges. /// - Bin 0: [min, p1) — low outliers /// - Bins 1 to n-2: [p1, p99) — main distribution, evenly divided /// - Bin n-1: [p99, max] — high outliers #[derive(Serialize, Clone)] pub struct Histogram { pub min: f32, pub max: f32, /// 1st percentile (left edge of main distribution) pub p1: f32, /// 99th percentile (right edge of main distribution) pub p99: f32, pub counts: Vec, } impl Histogram { /// Return the bin index for a given value using the outlier-bracket layout. #[cfg(test)] pub fn bin_for_value(&self, value: f32) -> usize { let num_bins = self.counts.len(); if value < self.p1 { 0 } else if value >= self.p99 { num_bins - 1 } else { let middle_bins = num_bins.saturating_sub(2); if middle_bins > 0 && self.p99 > self.p1 { let width = (self.p99 - self.p1) / middle_bins as f32; let middle_bin = ((value - self.p1) / width) as usize; (1 + middle_bin).min(num_bins - 2) } else { num_bins / 2 } } } /// Width of a single middle bin (bins 1..n-2). #[cfg(test)] pub fn middle_bin_width(&self) -> f32 { let middle_bins = self.counts.len().saturating_sub(2); if middle_bins > 0 && self.p99 > self.p1 { (self.p99 - self.p1) / middle_bins as f32 } else { 0.0 } } } pub struct FeatureStats { pub slider_min: f32, pub slider_max: f32, pub histogram: Histogram, } /// Compute a percentile from a uniformly-binned histogram. /// `prelim_counts` are uniform bins over [min, max]. fn percentile_from_uniform_histogram( count: usize, min: f32, max: f32, prelim_counts: &[u64], percentile: f32, ) -> f32 { if count == 0 || prelim_counts.is_empty() { return min; } let target = (count as f64 * percentile as f64 / 100.0).floor() as u64; let bin_width = (max - min) / prelim_counts.len() as f32; let mut cumulative = 0u64; for (i, &bin_count) in prelim_counts.iter().enumerate() { let prev_cumulative = cumulative; cumulative += bin_count; if cumulative > target { // Interpolate within this bin let bin_start = min + i as f32 * bin_width; let fraction = if bin_count > 0 { (target - prev_cumulative) as f32 / bin_count as f32 } else { 0.0 }; return bin_start + fraction * bin_width; } } max } /// Build a histogram and compute slider bounds based on the feature's Bounds config. pub fn compute_feature_stats(vals: &[f32], bounds: &Bounds, integer_bins: bool) -> FeatureStats { // Single pass: min, max, count (skipping NaN and infinity) let mut min = f32::INFINITY; let mut max = f32::NEG_INFINITY; let mut count = 0usize; for &value in vals { if value.is_finite() { if value < min { min = value; } if value > max { max = value; } count += 1; } } if count == 0 { let (slider_min, slider_max) = match bounds { Bounds::Fixed { min: fmin, max: fmax, } => (*fmin, *fmax), Bounds::Percentile { .. } => (0.0, 0.0), }; return FeatureStats { slider_min, slider_max, histogram: Histogram { min: 0.0, max: 0.0, p1: 0.0, p99: 0.0, counts: vec![0; HISTOGRAM_BINS], }, }; } // Build preliminary histogram with uniform bins to compute percentiles // Use full HISTOGRAM_BINS for percentile precision let range = if max == min { 1.0 } else { max - min }; let prelim_max = min + range * (1.0 + 1e-6); let prelim_bin_width = (prelim_max - min) / HISTOGRAM_BINS as f32; let mut prelim_counts = vec![0u64; HISTOGRAM_BINS]; for &value in vals { if value.is_finite() { let bin = ((value - min) / prelim_bin_width) as usize; prelim_counts[bin.min(HISTOGRAM_BINS - 1)] += 1; } } // Compute p1 and p99 from preliminary histogram let mut p1 = percentile_from_uniform_histogram(count, min, max, &prelim_counts, 1.0); let mut p99 = percentile_from_uniform_histogram(count, min, max, &prelim_counts, 99.0); // Iterative refinement for outlier-dominated distributions. // When extreme outliers (e.g. 317M sqm from web scraping) dominate the range, // the uniform histogram puts all real data in one bin, making percentile // estimation useless. Zoom into the estimated data region and recompute. let mut refined_counts = prelim_counts; let mut refined_count = count; let mut refined_min = min; let mut refined_max = max; for _ in 0..3 { let iqr = p99 - p1; if iqr <= 0.0 || (refined_max - refined_min) <= 5.0 * iqr { break; } let new_min = (p1 - iqr).max(min); let new_max = p99 + iqr; if new_max <= new_min { break; } let bin_width = (new_max - new_min) / HISTOGRAM_BINS as f32; let mut counts = vec![0u64; HISTOGRAM_BINS]; let mut cnt = 0usize; for &value in vals { if value.is_finite() && value >= new_min && value <= new_max { let bin = ((value - new_min) / bin_width) as usize; counts[bin.min(HISTOGRAM_BINS - 1)] += 1; cnt += 1; } } if cnt == 0 { break; } p1 = percentile_from_uniform_histogram(cnt, new_min, new_max, &counts, 1.0); p99 = percentile_from_uniform_histogram(cnt, new_min, new_max, &counts, 99.0); refined_counts = counts; refined_count = cnt; refined_min = new_min; refined_max = new_max; } // For integer-binned features, snap p1/p99 to integer boundaries // so each middle bin is exactly 1 unit wide. if integer_bins { p1 = p1.floor(); p99 = p99.ceil(); } // Determine number of histogram bins let num_bins = if integer_bins && p99 > p1 { // One middle bin per integer + 2 outlier bins (p99 - p1) as usize + 2 } else { // Count unique values within the p1–p99 range to cap histogram bins. // Using the full-range cardinality would over-allocate bins when outliers // inflate it (e.g. bedrooms: 1–137 unique values but only ~10 within p1–p99). let cardinality = { let mut unique_set = rustc_hash::FxHashSet::default(); for &val in vals { if val.is_finite() && val >= p1 && val <= p99 { unique_set.insert(val.to_bits()); } } unique_set.len() }; HISTOGRAM_BINS.min(cardinality).max(3) }; // Build final histogram with outlier bins at edges: // - Bin 0: [min, p1) — low outliers // - Bins 1 to n-2: [p1, p99) — main distribution, evenly divided // - Bin n-1: [p99, max] — high outliers let mut counts = vec![0u64; num_bins]; let middle_bins = num_bins.saturating_sub(2); let middle_width = if middle_bins > 0 && p99 > p1 { (p99 - p1) / middle_bins as f32 } else { 0.0 }; for &value in vals { if value.is_finite() { let bin = if value < p1 { 0 // Low outlier bin } else if value >= p99 { num_bins - 1 // High outlier bin } else if middle_width > 0.0 { // Middle bins (1 to n-2) let middle_bin = ((value - p1) / middle_width) as usize; (1 + middle_bin).min(num_bins - 2) } else { num_bins / 2 // Fallback if p1 == p99 }; counts[bin] += 1; } } let histogram = Histogram { min: refined_min, max: refined_max, p1, p99, counts, }; // Compute slider bounds (use refined histogram for accurate percentiles) let (slider_min, slider_max) = match bounds { Bounds::Fixed { min: fmin, max: fmax, } => (*fmin, *fmax), Bounds::Percentile { low, high } => { let p_low = percentile_from_uniform_histogram( refined_count, refined_min, refined_max, &refined_counts, *low as f32, ); let p_high = percentile_from_uniform_histogram( refined_count, refined_min, refined_max, &refined_counts, *high as f32, ); (p_low, p_high) } }; FeatureStats { slider_min, slider_max, histogram, } } pub(super) fn column_to_f32_vec(column: &Column) -> anyhow::Result> { let float_series = column .cast(&DataType::Float32) .context("Failed to cast column to Float32")?; let chunked = float_series .f32() .context("Failed to get f32 chunked array")?; Ok(chunked .into_iter() .map(|value| value.unwrap_or(f32::NAN)) .collect()) } #[cfg(test)] mod tests { use super::*; use crate::consts::QUANT_SCALE; use crate::features::Bounds; fn make_fixed_bounds(min: f32, max: f32) -> Bounds { Bounds::Fixed { min, max } } fn make_percentile_bounds(low: f64, high: f64) -> Bounds { Bounds::Percentile { low, high } } #[test] fn histogram_empty_data() { let data: Vec = vec![]; let bounds = make_fixed_bounds(0.0, 100.0); let stats = compute_feature_stats(&data, &bounds, false); assert_eq!(stats.slider_min, 0.0); assert_eq!(stats.slider_max, 100.0); assert_eq!(stats.histogram.counts.iter().sum::(), 0); } #[test] fn histogram_single_value() { let data = vec![50.0_f32]; let bounds = make_fixed_bounds(0.0, 100.0); let stats = compute_feature_stats(&data, &bounds, false); assert_eq!(stats.histogram.min, 50.0); assert_eq!(stats.histogram.max, 50.0); assert_eq!(stats.histogram.counts.iter().sum::(), 1); } #[test] fn histogram_uniform_distribution() { let data: Vec = (0..100).map(|i| i as f32).collect(); let bounds = make_fixed_bounds(0.0, 100.0); let stats = compute_feature_stats(&data, &bounds, false); assert_eq!(stats.histogram.min, 0.0); assert_eq!(stats.histogram.max, 99.0); assert_eq!(stats.histogram.counts.iter().sum::(), 100); } #[test] fn histogram_with_nan_values() { let data = vec![10.0_f32, f32::NAN, 20.0, f32::NAN, 30.0]; let bounds = make_fixed_bounds(0.0, 100.0); let stats = compute_feature_stats(&data, &bounds, false); assert_eq!(stats.histogram.counts.iter().sum::(), 3); assert_eq!(stats.histogram.min, 10.0); assert_eq!(stats.histogram.max, 30.0); } #[test] fn histogram_all_nan() { let data = vec![f32::NAN, f32::NAN, f32::NAN]; let bounds = make_fixed_bounds(0.0, 100.0); let stats = compute_feature_stats(&data, &bounds, false); assert_eq!(stats.histogram.counts.iter().sum::(), 0); } #[test] fn histogram_all_same_value() { let data = vec![42.0_f32; 1000]; let bounds = make_fixed_bounds(0.0, 100.0); let stats = compute_feature_stats(&data, &bounds, false); assert_eq!(stats.histogram.min, 42.0); assert_eq!(stats.histogram.max, 42.0); assert_eq!(stats.histogram.p1, 42.0); assert_eq!(stats.histogram.p99, 42.0); assert_eq!(stats.histogram.counts.iter().sum::(), 1000); } #[test] fn histogram_percentile_bounds() { let mut data: Vec = vec![0.0]; // Low outlier data.extend((1..99).map(|i| 50.0 + i as f32 * 0.01)); data.push(1000.0); // High outlier let bounds = make_percentile_bounds(2.0, 98.0); let stats = compute_feature_stats(&data, &bounds, false); assert!(stats.slider_min > 0.0); assert!(stats.slider_max < 1000.0); } #[test] fn fixed_price_bounds_keep_slider_cap() { let data = vec![400_000.0_f32, 2_500_000.0, 3_750_000.0]; let bounds = make_fixed_bounds(0.0, 2_500_000.0); let stats = compute_feature_stats(&data, &bounds, false); assert_eq!(stats.slider_min, 0.0); assert_eq!(stats.slider_max, 2_500_000.0); } #[test] fn histogram_bin_for_value() { let hist = Histogram { min: 0.0, max: 100.0, p1: 10.0, p99: 90.0, counts: vec![0; 10], }; assert_eq!(hist.bin_for_value(5.0), 0); // Low outlier bin assert_eq!(hist.bin_for_value(95.0), 9); // High outlier bin let mid_value = 50.0; let bin = hist.bin_for_value(mid_value); assert!((1..=8).contains(&bin)); } #[test] fn histogram_middle_bin_width() { let hist = Histogram { min: 0.0, max: 100.0, p1: 10.0, p99: 90.0, counts: vec![0; 10], }; let expected_width = (90.0 - 10.0) / 8.0; assert!((hist.middle_bin_width() - expected_width).abs() < 0.001); } #[test] fn histogram_cardinality_caps_bins() { let data = vec![1.0_f32, 1.0, 2.0, 2.0, 3.0, 3.0]; let bounds = make_fixed_bounds(0.0, 100.0); let stats = compute_feature_stats(&data, &bounds, false); assert_eq!(stats.histogram.counts.len(), 3); } #[test] fn min_max_skips_nan() { let values = vec![10.0_f32, f32::NAN, 20.0, f32::NAN, 5.0]; let mut min = f32::INFINITY; let mut max = f32::NEG_INFINITY; for &v in &values { if v.is_finite() { if v < min { min = v; } if v > max { max = v; } } } assert_eq!(min, 5.0); assert_eq!(max, 20.0); } #[test] fn count_skips_nan() { let values = [1.0_f32, f32::NAN, 2.0, f32::NAN, 3.0]; let count = values.iter().filter(|v| v.is_finite()).count(); assert_eq!(count, 3); } #[test] fn infinity_values_excluded() { let data = vec![f32::INFINITY, f32::NEG_INFINITY, 50.0]; let bounds = Bounds::Fixed { min: 0.0, max: 100.0, }; let stats = compute_feature_stats(&data, &bounds, false); assert_eq!(stats.histogram.min, 50.0); assert_eq!(stats.histogram.max, 50.0); assert_eq!(stats.histogram.counts.iter().sum::(), 1); } #[test] fn only_finite_values() { let data = vec![10.0_f32, 20.0, 30.0]; let bounds = Bounds::Fixed { min: 0.0, max: 100.0, }; let stats = compute_feature_stats(&data, &bounds, false); assert_eq!(stats.histogram.min, 10.0); assert_eq!(stats.histogram.max, 30.0); assert_eq!(stats.histogram.counts.iter().sum::(), 3); } #[test] fn extreme_outlier_does_not_destroy_quantization() { // Simulate floor area: 10k normal values (50-200 sqm) + one 317M outlier let mut data: Vec = (0..10_000).map(|i| 50.0 + (i % 150) as f32).collect(); data.push(317_000_000.0); // Extreme outlier from web scraping let bounds = make_percentile_bounds(0.0, 98.0); let stats = compute_feature_stats(&data, &bounds, false); // After refinement, histogram range should be much tighter than 317M assert!( stats.histogram.max < 1_000_000.0, "histogram.max should be refined, got {}", stats.histogram.max, ); // p1 should be near 50, not millions assert!( stats.histogram.p1 < 100.0, "p1 should be near real data, got {}", stats.histogram.p1, ); // Slider min should reflect actual data range assert!( stats.slider_min < 100.0, "slider_min should be near real data, got {}", stats.slider_min, ); // Quantization using histogram.min/max should give usable range let qmin = stats.histogram.min; let qrange = stats.histogram.max - stats.histogram.min; assert!(qrange > 0.0 && qrange < 1_000_000.0); // A typical floor area (100 sqm) should be distinguishable from min let normalized = (100.0 - qmin) / qrange; let encoded = (normalized * QUANT_SCALE).round() as u16; assert!( encoded > 100, "100 sqm should encode to a meaningful u16 value, got {}", encoded, ); } }