perfect-postcode/server-rs/src/data/property/stats.rs
Andras Schmelczer f59d01227b
Some checks failed
Build and publish Docker image / build-and-push (push) Failing after 15s
CI / Check (push) Failing after 1m58s
SPlit up
2026-06-12 21:51:37 +01:00

544 lines
17 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

//! Feature statistics: outlier-bracketed histograms, percentile estimation and
//! slider-bound computation.
use anyhow::Context;
use polars::prelude::*;
use serde::Serialize;
use crate::consts::HISTOGRAM_BINS;
use crate::features::Bounds;
/// Histogram with outlier buckets at the edges.
///
/// With `n = counts.len()` bins, the layout is:
/// - Bin 0: [min, p1) — low outliers
/// - Bins 1 to n-2: [p1, p99) — main distribution, evenly divided
/// - Bin n-1: [p99, max] — high outliers
#[derive(Serialize, Clone)]
pub struct Histogram {
/// Lower edge of the histogram's value range (left edge of bin 0).
pub min: f32,
/// Upper edge of the histogram's value range (right edge of the last bin).
pub max: f32,
/// 1st percentile (left edge of main distribution)
pub p1: f32,
/// 99th percentile (right edge of main distribution)
pub p99: f32,
/// Number of values that fell into each bin; `counts.len()` is the total
/// bin count, including the two outlier bins.
pub counts: Vec<u64>,
}
impl Histogram {
    /// Map `value` to its bin index under the outlier-bracket layout.
    ///
    /// Values below `p1` go to bin 0 and values at or above `p99` go to the
    /// last bin. Anything in between is spread evenly over the middle bins
    /// (1 to n-2). When there are no middle bins, or `p99` is not greater than
    /// `p1`, the central bin `n / 2` is used instead.
    ///
    /// Expects a non-empty `counts`; with zero bins the index arithmetic
    /// underflows.
    #[cfg(test)]
    pub fn bin_for_value(&self, value: f32) -> usize {
        let total_bins = self.counts.len();
        if value < self.p1 {
            return 0;
        }
        if value >= self.p99 {
            return total_bins - 1;
        }

        let inner_bins = total_bins.saturating_sub(2);
        let has_spread = self.p99 > self.p1;
        if inner_bins == 0 || !has_spread {
            // No usable middle range to divide up.
            return total_bins / 2;
        }

        let inner_width = (self.p99 - self.p1) / inner_bins as f32;
        let offset = ((value - self.p1) / inner_width) as usize;
        // Float rounding may push the offset one past the end; keep it inside
        // the middle range.
        (offset + 1).min(total_bins - 2)
    }

    /// Width of a single middle bin (bins 1 to n-2).
    ///
    /// Returns `0.0` when there are no middle bins or when `p99` is not
    /// greater than `p1`.
    #[cfg(test)]
    pub fn middle_bin_width(&self) -> f32 {
        match self.counts.len().saturating_sub(2) {
            0 => 0.0,
            inner_bins if self.p99 > self.p1 => (self.p99 - self.p1) / inner_bins as f32,
            _ => 0.0,
        }
    }
}
/// Output of [`compute_feature_stats`]: slider bounds plus the value histogram.
pub struct FeatureStats {
/// Lower slider bound: the configured minimum for `Bounds::Fixed`, or the
/// estimated `low` percentile for `Bounds::Percentile`.
pub slider_min: f32,
/// Upper slider bound: the configured maximum for `Bounds::Fixed`, or the
/// estimated `high` percentile for `Bounds::Percentile`.
pub slider_max: f32,
/// Outlier-bracketed histogram of the feature's finite values.
pub histogram: Histogram,
}
/// Estimate a percentile from a uniformly-binned histogram.
///
/// `prelim_counts` holds the counts of equal-width bins spanning `[min, max]`
/// and `count` is the number of values that were binned. The rank
/// `floor(count * percentile / 100)` is located by walking the cumulative
/// counts, then linearly interpolated inside the bin where the running total
/// first exceeds it.
///
/// Returns `min` when `count` is zero or there are no bins, and `max` when the
/// running total never exceeds the rank (e.g. the 100th percentile).
fn percentile_from_uniform_histogram(
    count: usize,
    min: f32,
    max: f32,
    prelim_counts: &[u64],
    percentile: f32,
) -> f32 {
    if count == 0 || prelim_counts.is_empty() {
        return min;
    }

    let rank = (count as f64 * percentile as f64 / 100.0).floor() as u64;
    let width = (max - min) / prelim_counts.len() as f32;

    let mut seen = 0u64;
    prelim_counts
        .iter()
        .enumerate()
        .find_map(|(index, &in_bin)| {
            let below = seen;
            seen += in_bin;
            if seen <= rank {
                return None;
            }
            // `below <= rank < below + in_bin`, so `in_bin` is non-zero here
            // and the interpolation fraction lies in [0, 1).
            let left_edge = min + index as f32 * width;
            let fraction = (rank - below) as f32 / in_bin as f32;
            Some(left_edge + fraction * width)
        })
        .unwrap_or(max)
}
/// Build a histogram and compute slider bounds based on the feature's `Bounds` config.
///
/// Non-finite values (NaN, ±infinity) are ignored throughout. The steps are:
/// 1. Scan for the finite minimum, maximum and count.
/// 2. Estimate p1/p99 from a uniform `HISTOGRAM_BINS`-bin histogram over `[min, max]`.
/// 3. While the binned range is more than 5× wider than the p1..p99 spread
///    (i.e. outliers dominate), zoom into `[p1 - spread, p99 + spread]`, clamped
///    to the observed `[min, max]`, and re-estimate. At most three passes.
/// 4. Build the final outlier-bracketed histogram around p1/p99.
/// 5. Derive the slider bounds from `bounds`.
///
/// # Arguments
/// * `vals` - raw feature values; may contain NaN or infinities.
/// * `bounds` - `Bounds::Fixed` passes its limits straight through as slider
///   bounds; `Bounds::Percentile` estimates them from the (zoomed) uniform histogram.
/// * `integer_bins` - snap p1 down and p99 up to whole numbers so that every
///   middle bin is exactly one unit wide.
///
/// # Returns
/// A [`FeatureStats`] whose `histogram.min` / `histogram.max` are the last zoom
/// window (the observed finite min/max when no zoom was needed). When `vals`
/// has no finite value the histogram is all zeros with `HISTOGRAM_BINS` bins,
/// and percentile-based slider bounds are `(0.0, 0.0)`.
pub fn compute_feature_stats(vals: &[f32], bounds: &Bounds, integer_bins: bool) -> FeatureStats {
    /// Upper limit on zoom passes.
    const MAX_ZOOM_PASSES: usize = 3;
    /// Zoom only while the binned range exceeds this multiple of the p1..p99 spread.
    const ZOOM_TRIGGER_RATIO: f32 = 5.0;

    let (min, max, count) = match finite_min_max_count(vals) {
        Some(extent) => extent,
        None => {
            // Nothing finite to describe: return an empty histogram.
            let (slider_min, slider_max) = match bounds {
                Bounds::Fixed {
                    min: fmin,
                    max: fmax,
                } => (*fmin, *fmax),
                Bounds::Percentile { .. } => (0.0, 0.0),
            };
            return FeatureStats {
                slider_min,
                slider_max,
                histogram: Histogram {
                    min: 0.0,
                    max: 0.0,
                    p1: 0.0,
                    p99: 0.0,
                    counts: vec![0; HISTOGRAM_BINS],
                },
            };
        }
    };

    // Preliminary uniform histogram over the full range, used only to estimate
    // percentiles. The range is widened by a hair so `max` itself falls inside
    // the last bin, and a degenerate range (max == min) is treated as width 1.
    let range = if max == min { 1.0 } else { max - min };
    let prelim_max = min + range * (1.0 + 1e-6);
    let prelim_bin_width = (prelim_max - min) / HISTOGRAM_BINS as f32;
    let (prelim_counts, _) = uniform_window_counts(vals, min, max, prelim_bin_width);

    let mut p1 = percentile_from_uniform_histogram(count, min, max, &prelim_counts, 1.0);
    let mut p99 = percentile_from_uniform_histogram(count, min, max, &prelim_counts, 99.0);

    // Iterative refinement for outlier-dominated distributions.
    // When extreme outliers (e.g. 317M sqm from web scraping) dominate the range,
    // the uniform histogram puts all real data in one bin, making percentile
    // estimation useless. Zoom into the estimated data region and recompute.
    let mut refined_counts = prelim_counts;
    let mut refined_count = count;
    let mut refined_min = min;
    let mut refined_max = max;
    for _ in 0..MAX_ZOOM_PASSES {
        // Width of the p1..p99 band (not a true interquartile range).
        let spread = p99 - p1;
        if spread <= 0.0 || (refined_max - refined_min) <= ZOOM_TRIGGER_RATIO * spread {
            break;
        }
        // Pad the band by one spread on each side, but never beyond the data
        // itself. Without the upper clamp, low-side outliers would push the
        // window (and the reported histogram max) past the largest real value
        // and waste bins on empty space.
        let new_min = (p1 - spread).max(min);
        let new_max = (p99 + spread).min(max);
        if new_max <= new_min {
            break;
        }
        let bin_width = (new_max - new_min) / HISTOGRAM_BINS as f32;
        let (window_counts, window_count) =
            uniform_window_counts(vals, new_min, new_max, bin_width);
        if window_count == 0 {
            break;
        }
        p1 = percentile_from_uniform_histogram(window_count, new_min, new_max, &window_counts, 1.0);
        p99 =
            percentile_from_uniform_histogram(window_count, new_min, new_max, &window_counts, 99.0);
        refined_counts = window_counts;
        refined_count = window_count;
        refined_min = new_min;
        refined_max = new_max;
    }

    // For integer-binned features, snap p1/p99 to integer boundaries
    // so each middle bin is exactly 1 unit wide.
    if integer_bins {
        p1 = p1.floor();
        p99 = p99.ceil();
    }

    let num_bins = outlier_bracket_bin_count(vals, p1, p99, integer_bins);
    let histogram = Histogram {
        min: refined_min,
        max: refined_max,
        p1,
        p99,
        counts: outlier_bracket_counts(vals, p1, p99, num_bins),
    };

    // Slider bounds; percentile bounds reuse the refined uniform histogram so
    // they benefit from the zoomed-in resolution.
    let (slider_min, slider_max) = match bounds {
        Bounds::Fixed {
            min: fmin,
            max: fmax,
        } => (*fmin, *fmax),
        Bounds::Percentile { low, high } => {
            let estimate = |percentile: f32| {
                percentile_from_uniform_histogram(
                    refined_count,
                    refined_min,
                    refined_max,
                    &refined_counts,
                    percentile,
                )
            };
            (estimate(*low as f32), estimate(*high as f32))
        }
    };

    FeatureStats {
        slider_min,
        slider_max,
        histogram,
    }
}

/// Smallest and largest finite value plus the number of finite values, or
/// `None` when `vals` contains no finite value at all.
fn finite_min_max_count(vals: &[f32]) -> Option<(f32, f32, usize)> {
    let mut min = f32::INFINITY;
    let mut max = f32::NEG_INFINITY;
    let mut count = 0usize;
    for &value in vals.iter().filter(|v| v.is_finite()) {
        if value < min {
            min = value;
        }
        if value > max {
            max = value;
        }
        count += 1;
    }
    if count == 0 {
        None
    } else {
        Some((min, max, count))
    }
}

/// Count the finite values inside `[lo, hi]` into `HISTOGRAM_BINS` uniform bins
/// of width `bin_width` starting at `lo`. Returns the bin counts and the number
/// of values that were binned.
fn uniform_window_counts(vals: &[f32], lo: f32, hi: f32, bin_width: f32) -> (Vec<u64>, usize) {
    let mut counts = vec![0u64; HISTOGRAM_BINS];
    let mut kept = 0usize;
    for &value in vals {
        if value.is_finite() && value >= lo && value <= hi {
            let bin = ((value - lo) / bin_width) as usize;
            // Values on the upper edge would index one past the end.
            counts[bin.min(HISTOGRAM_BINS - 1)] += 1;
            kept += 1;
        }
    }
    (counts, kept)
}

/// Number of bins (two outlier bins included) for the final histogram.
fn outlier_bracket_bin_count(vals: &[f32], p1: f32, p99: f32, integer_bins: bool) -> usize {
    if integer_bins && p99 > p1 {
        // One middle bin per integer + 2 outlier bins
        return (p99 - p1) as usize + 2;
    }
    // Count unique values within the p1..p99 range to cap histogram bins.
    // Using the full-range cardinality would over-allocate bins when outliers
    // inflate it (e.g. bedrooms: 1137 unique values but only ~10 within p1..p99).
    let mut distinct = rustc_hash::FxHashSet::default();
    for &value in vals {
        if value.is_finite() && value >= p1 && value <= p99 {
            distinct.insert(value.to_bits());
        }
    }
    HISTOGRAM_BINS.min(distinct.len()).max(3)
}

/// Bin every finite value into the outlier-bracket layout:
/// - Bin 0: [min, p1) — low outliers
/// - Bins 1 to n-2: [p1, p99) — main distribution, evenly divided
/// - Bin n-1: [p99, max] — high outliers
///
/// `num_bins` is expected to be at least 3, which `outlier_bracket_bin_count` guarantees.
fn outlier_bracket_counts(vals: &[f32], p1: f32, p99: f32, num_bins: usize) -> Vec<u64> {
    let mut counts = vec![0u64; num_bins];
    let middle_bins = num_bins.saturating_sub(2);
    let middle_width = if middle_bins > 0 && p99 > p1 {
        (p99 - p1) / middle_bins as f32
    } else {
        0.0
    };
    for &value in vals.iter().filter(|v| v.is_finite()) {
        let bin = if value < p1 {
            0 // Low outlier bin
        } else if value >= p99 {
            num_bins - 1 // High outlier bin
        } else if middle_width > 0.0 {
            // Middle bins (1 to n-2)
            let middle_bin = ((value - p1) / middle_width) as usize;
            (1 + middle_bin).min(num_bins - 2)
        } else {
            num_bins / 2 // Fallback if p1 == p99
        };
        counts[bin] += 1;
    }
    counts
}
/// Convert a column into a plain `Vec<f32>`.
///
/// The column is cast to `Float32` first; entries the chunked array yields as
/// `None` (presumably nulls) are replaced with `f32::NAN`.
///
/// # Errors
/// Fails when the cast to `Float32` fails or the cast result cannot be viewed
/// as an `f32` chunked array.
pub(super) fn column_to_f32_vec(column: &Column) -> anyhow::Result<Vec<f32>> {
    let as_float = column
        .cast(&DataType::Float32)
        .context("Failed to cast column to Float32")?;
    let floats = as_float
        .f32()
        .context("Failed to get f32 chunked array")?;

    let none_to_nan = |slot: Option<f32>| slot.unwrap_or(f32::NAN);
    let values: Vec<f32> = floats.into_iter().map(none_to_nan).collect();
    Ok(values)
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::consts::QUANT_SCALE;
    use crate::features::Bounds;

    /// Fixed slider bounds `[min, max]`.
    fn make_fixed_bounds(min: f32, max: f32) -> Bounds {
        Bounds::Fixed { min, max }
    }

    /// Percentile-based slider bounds.
    fn make_percentile_bounds(low: f64, high: f64) -> Bounds {
        Bounds::Percentile { low, high }
    }

    /// Total number of values counted across all histogram bins.
    fn binned_total(stats: &FeatureStats) -> u64 {
        stats.histogram.counts.iter().sum()
    }

    #[test]
    fn histogram_empty_data() {
        let no_values = Vec::<f32>::new();
        let stats = compute_feature_stats(&no_values, &make_fixed_bounds(0.0, 100.0), false);

        assert_eq!(stats.slider_min, 0.0);
        assert_eq!(stats.slider_max, 100.0);
        assert_eq!(binned_total(&stats), 0);
    }

    #[test]
    fn histogram_single_value() {
        let stats = compute_feature_stats(&[50.0_f32], &make_fixed_bounds(0.0, 100.0), false);

        assert_eq!(stats.histogram.min, 50.0);
        assert_eq!(stats.histogram.max, 50.0);
        assert_eq!(binned_total(&stats), 1);
    }

    #[test]
    fn histogram_uniform_distribution() {
        let ramp: Vec<f32> = (0..100).map(|step| step as f32).collect();
        let stats = compute_feature_stats(&ramp, &make_fixed_bounds(0.0, 100.0), false);

        assert_eq!(stats.histogram.min, 0.0);
        assert_eq!(stats.histogram.max, 99.0);
        assert_eq!(binned_total(&stats), 100);
    }

    #[test]
    fn histogram_with_nan_values() {
        let with_gaps = [10.0_f32, f32::NAN, 20.0, f32::NAN, 30.0];
        let stats = compute_feature_stats(&with_gaps, &make_fixed_bounds(0.0, 100.0), false);

        assert_eq!(binned_total(&stats), 3);
        assert_eq!(stats.histogram.min, 10.0);
        assert_eq!(stats.histogram.max, 30.0);
    }

    #[test]
    fn histogram_all_nan() {
        let only_nan = [f32::NAN; 3];
        let stats = compute_feature_stats(&only_nan, &make_fixed_bounds(0.0, 100.0), false);

        assert_eq!(binned_total(&stats), 0);
    }

    #[test]
    fn histogram_all_same_value() {
        let constant = vec![42.0_f32; 1000];
        let stats = compute_feature_stats(&constant, &make_fixed_bounds(0.0, 100.0), false);
        let hist = &stats.histogram;

        assert_eq!(hist.min, 42.0);
        assert_eq!(hist.max, 42.0);
        assert_eq!(hist.p1, 42.0);
        assert_eq!(hist.p99, 42.0);
        assert_eq!(binned_total(&stats), 1000);
    }

    #[test]
    fn histogram_percentile_bounds() {
        // One low outlier, a tight cluster around 50, one high outlier.
        let cluster = (1..99).map(|step| 50.0 + step as f32 * 0.01);
        let data: Vec<f32> = std::iter::once(0.0)
            .chain(cluster)
            .chain(std::iter::once(1000.0))
            .collect();

        let stats = compute_feature_stats(&data, &make_percentile_bounds(2.0, 98.0), false);

        assert!(stats.slider_min > 0.0);
        assert!(stats.slider_max < 1000.0);
    }

    #[test]
    fn fixed_price_bounds_keep_slider_cap() {
        let prices = [400_000.0_f32, 2_500_000.0, 3_750_000.0];
        let stats = compute_feature_stats(&prices, &make_fixed_bounds(0.0, 2_500_000.0), false);

        assert_eq!(stats.slider_min, 0.0);
        assert_eq!(stats.slider_max, 2_500_000.0);
    }

    #[test]
    fn histogram_bin_for_value() {
        let hist = Histogram {
            min: 0.0,
            max: 100.0,
            p1: 10.0,
            p99: 90.0,
            counts: vec![0; 10],
        };

        // Below p1: low outlier bin.
        assert_eq!(hist.bin_for_value(5.0), 0);
        // At or above p99: high outlier bin.
        assert_eq!(hist.bin_for_value(95.0), 9);
        // Anything in between lands in a middle bin.
        let middle = hist.bin_for_value(50.0);
        assert!((1..=8).contains(&middle));
    }

    #[test]
    fn histogram_middle_bin_width() {
        let hist = Histogram {
            min: 0.0,
            max: 100.0,
            p1: 10.0,
            p99: 90.0,
            counts: vec![0; 10],
        };

        let expected_width = (90.0 - 10.0) / 8.0;
        let difference = (hist.middle_bin_width() - expected_width).abs();
        assert!(difference < 0.001);
    }

    #[test]
    fn histogram_cardinality_caps_bins() {
        let few_distinct = [1.0_f32, 1.0, 2.0, 2.0, 3.0, 3.0];
        let stats = compute_feature_stats(&few_distinct, &make_fixed_bounds(0.0, 100.0), false);

        assert_eq!(stats.histogram.counts.len(), 3);
    }

    #[test]
    fn min_max_skips_nan() {
        let values = vec![10.0_f32, f32::NAN, 20.0, f32::NAN, 5.0];

        let (min, max) = values
            .iter()
            .copied()
            .filter(|candidate| candidate.is_finite())
            .fold(
                (f32::INFINITY, f32::NEG_INFINITY),
                |(lowest, highest), candidate| {
                    (
                        if candidate < lowest { candidate } else { lowest },
                        if candidate > highest { candidate } else { highest },
                    )
                },
            );

        assert_eq!(min, 5.0);
        assert_eq!(max, 20.0);
    }

    #[test]
    fn count_skips_nan() {
        let values = [1.0_f32, f32::NAN, 2.0, f32::NAN, 3.0];

        let mut finite = 0usize;
        for value in values.iter() {
            if value.is_finite() {
                finite += 1;
            }
        }

        assert_eq!(finite, 3);
    }

    #[test]
    fn infinity_values_excluded() {
        let data = [f32::INFINITY, f32::NEG_INFINITY, 50.0];
        let stats = compute_feature_stats(&data, &make_fixed_bounds(0.0, 100.0), false);

        assert_eq!(stats.histogram.min, 50.0);
        assert_eq!(stats.histogram.max, 50.0);
        assert_eq!(binned_total(&stats), 1);
    }

    #[test]
    fn only_finite_values() {
        let data = [10.0_f32, 20.0, 30.0];
        let stats = compute_feature_stats(&data, &make_fixed_bounds(0.0, 100.0), false);

        assert_eq!(stats.histogram.min, 10.0);
        assert_eq!(stats.histogram.max, 30.0);
        assert_eq!(binned_total(&stats), 3);
    }

    #[test]
    fn extreme_outlier_does_not_destroy_quantization() {
        // Simulated floor areas: 10k ordinary values (50-200 sqm) followed by a
        // single 317M sqm outlier of the kind web scraping produces.
        let ordinary = (0..10_000).map(|row| 50.0 + (row % 150) as f32);
        let data: Vec<f32> = ordinary.chain(std::iter::once(317_000_000.0)).collect();

        let stats = compute_feature_stats(&data, &make_percentile_bounds(0.0, 98.0), false);
        let hist = &stats.histogram;

        // Refinement should shrink the histogram range far below 317M.
        assert!(
            hist.max < 1_000_000.0,
            "histogram.max should be refined, got {}",
            hist.max,
        );
        // p1 belongs near 50, not in the millions.
        assert!(
            hist.p1 < 100.0,
            "p1 should be near real data, got {}",
            hist.p1,
        );
        // The slider minimum should track the real data as well.
        assert!(
            stats.slider_min < 100.0,
            "slider_min should be near real data, got {}",
            stats.slider_min,
        );

        // Quantizing against histogram.min/max must leave a usable range.
        let qmin = hist.min;
        let qrange = hist.max - hist.min;
        assert!(qrange > 0.0 && qrange < 1_000_000.0);

        // A typical floor area (100 sqm) has to stay distinguishable from min.
        let normalized = (100.0 - qmin) / qrange;
        let encoded = (normalized * QUANT_SCALE).round() as u16;
        assert!(
            encoded > 100,
            "100 sqm should encode to a meaningful u16 value, got {}",
            encoded,
        );
    }
}