Split up
This commit is contained in:
parent
cf39ad754e
commit
f59d01227b
91 changed files with 10370 additions and 7562 deletions
544
server-rs/src/data/property/stats.rs
Normal file
544
server-rs/src/data/property/stats.rs
Normal file
|
|
@ -0,0 +1,544 @@
|
|||
//! Feature statistics: outlier-bracketed histograms, percentile estimation and
|
||||
//! slider-bound computation.
|
||||
|
||||
use anyhow::Context;
|
||||
use polars::prelude::*;
|
||||
use serde::Serialize;
|
||||
|
||||
use crate::consts::HISTOGRAM_BINS;
|
||||
use crate::features::Bounds;
|
||||
|
||||
/// Histogram with outlier buckets at the edges.
|
||||
/// - Bin 0: [min, p1) — low outliers
|
||||
/// - Bins 1 to n-2: [p1, p99) — main distribution, evenly divided
|
||||
/// - Bin n-1: [p99, max] — high outliers
|
||||
#[derive(Serialize, Clone)]
|
||||
pub struct Histogram {
|
||||
pub min: f32,
|
||||
pub max: f32,
|
||||
/// 1st percentile (left edge of main distribution)
|
||||
pub p1: f32,
|
||||
/// 99th percentile (right edge of main distribution)
|
||||
pub p99: f32,
|
||||
pub counts: Vec<u64>,
|
||||
}
|
||||
|
||||
impl Histogram {
|
||||
/// Return the bin index for a given value using the outlier-bracket layout.
|
||||
#[cfg(test)]
|
||||
pub fn bin_for_value(&self, value: f32) -> usize {
|
||||
let num_bins = self.counts.len();
|
||||
if value < self.p1 {
|
||||
0
|
||||
} else if value >= self.p99 {
|
||||
num_bins - 1
|
||||
} else {
|
||||
let middle_bins = num_bins.saturating_sub(2);
|
||||
if middle_bins > 0 && self.p99 > self.p1 {
|
||||
let width = (self.p99 - self.p1) / middle_bins as f32;
|
||||
let middle_bin = ((value - self.p1) / width) as usize;
|
||||
(1 + middle_bin).min(num_bins - 2)
|
||||
} else {
|
||||
num_bins / 2
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Width of a single middle bin (bins 1..n-2).
|
||||
#[cfg(test)]
|
||||
pub fn middle_bin_width(&self) -> f32 {
|
||||
let middle_bins = self.counts.len().saturating_sub(2);
|
||||
if middle_bins > 0 && self.p99 > self.p1 {
|
||||
(self.p99 - self.p1) / middle_bins as f32
|
||||
} else {
|
||||
0.0
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct FeatureStats {
|
||||
pub slider_min: f32,
|
||||
pub slider_max: f32,
|
||||
pub histogram: Histogram,
|
||||
}
|
||||
|
||||
/// Compute a percentile from a uniformly-binned histogram.
///
/// `prelim_counts` are uniform bins over `[min, max]` and `count` is the
/// number of values they hold. The bin containing the target rank
/// `floor(count * percentile / 100)` is located and the result is linearly
/// interpolated inside that bin.
///
/// Returns `min` when `count` is zero or there are no bins, and `max` when the
/// target rank lies beyond the last counted value (e.g. `percentile == 100`).
fn percentile_from_uniform_histogram(
    count: usize,
    min: f32,
    max: f32,
    prelim_counts: &[u64],
    percentile: f32,
) -> f32 {
    if count == 0 || prelim_counts.is_empty() {
        return min;
    }

    let target = (count as f64 * percentile as f64 / 100.0).floor() as u64;
    let bin_width = (max - min) / prelim_counts.len() as f32;

    let mut cumulative = 0u64;
    for (i, &bin_count) in prelim_counts.iter().enumerate() {
        let below = cumulative;
        cumulative += bin_count;
        if cumulative > target {
            // Interpolate within this bin. Every earlier bin left
            // `cumulative <= target`, so `below <= target < cumulative` and
            // therefore `bin_count > 0`: the division is well defined.
            let bin_start = min + i as f32 * bin_width;
            let fraction = (target - below) as f32 / bin_count as f32;
            return bin_start + fraction * bin_width;
        }
    }
    max
}
|
||||
|
||||
/// Build a histogram and compute slider bounds based on the feature's Bounds config.
|
||||
pub fn compute_feature_stats(vals: &[f32], bounds: &Bounds, integer_bins: bool) -> FeatureStats {
|
||||
// Single pass: min, max, count (skipping NaN and infinity)
|
||||
let mut min = f32::INFINITY;
|
||||
let mut max = f32::NEG_INFINITY;
|
||||
let mut count = 0usize;
|
||||
for &value in vals {
|
||||
if value.is_finite() {
|
||||
if value < min {
|
||||
min = value;
|
||||
}
|
||||
if value > max {
|
||||
max = value;
|
||||
}
|
||||
count += 1;
|
||||
}
|
||||
}
|
||||
|
||||
if count == 0 {
|
||||
let (slider_min, slider_max) = match bounds {
|
||||
Bounds::Fixed {
|
||||
min: fmin,
|
||||
max: fmax,
|
||||
} => (*fmin, *fmax),
|
||||
Bounds::Percentile { .. } => (0.0, 0.0),
|
||||
};
|
||||
return FeatureStats {
|
||||
slider_min,
|
||||
slider_max,
|
||||
histogram: Histogram {
|
||||
min: 0.0,
|
||||
max: 0.0,
|
||||
p1: 0.0,
|
||||
p99: 0.0,
|
||||
counts: vec![0; HISTOGRAM_BINS],
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
// Build preliminary histogram with uniform bins to compute percentiles
|
||||
// Use full HISTOGRAM_BINS for percentile precision
|
||||
let range = if max == min { 1.0 } else { max - min };
|
||||
let prelim_max = min + range * (1.0 + 1e-6);
|
||||
let prelim_bin_width = (prelim_max - min) / HISTOGRAM_BINS as f32;
|
||||
|
||||
let mut prelim_counts = vec![0u64; HISTOGRAM_BINS];
|
||||
for &value in vals {
|
||||
if value.is_finite() {
|
||||
let bin = ((value - min) / prelim_bin_width) as usize;
|
||||
prelim_counts[bin.min(HISTOGRAM_BINS - 1)] += 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Compute p1 and p99 from preliminary histogram
|
||||
let mut p1 = percentile_from_uniform_histogram(count, min, max, &prelim_counts, 1.0);
|
||||
let mut p99 = percentile_from_uniform_histogram(count, min, max, &prelim_counts, 99.0);
|
||||
|
||||
// Iterative refinement for outlier-dominated distributions.
|
||||
// When extreme outliers (e.g. 317M sqm from web scraping) dominate the range,
|
||||
// the uniform histogram puts all real data in one bin, making percentile
|
||||
// estimation useless. Zoom into the estimated data region and recompute.
|
||||
let mut refined_counts = prelim_counts;
|
||||
let mut refined_count = count;
|
||||
let mut refined_min = min;
|
||||
let mut refined_max = max;
|
||||
for _ in 0..3 {
|
||||
let iqr = p99 - p1;
|
||||
if iqr <= 0.0 || (refined_max - refined_min) <= 5.0 * iqr {
|
||||
break;
|
||||
}
|
||||
let new_min = (p1 - iqr).max(min);
|
||||
let new_max = p99 + iqr;
|
||||
if new_max <= new_min {
|
||||
break;
|
||||
}
|
||||
let bin_width = (new_max - new_min) / HISTOGRAM_BINS as f32;
|
||||
let mut counts = vec![0u64; HISTOGRAM_BINS];
|
||||
let mut cnt = 0usize;
|
||||
for &value in vals {
|
||||
if value.is_finite() && value >= new_min && value <= new_max {
|
||||
let bin = ((value - new_min) / bin_width) as usize;
|
||||
counts[bin.min(HISTOGRAM_BINS - 1)] += 1;
|
||||
cnt += 1;
|
||||
}
|
||||
}
|
||||
if cnt == 0 {
|
||||
break;
|
||||
}
|
||||
p1 = percentile_from_uniform_histogram(cnt, new_min, new_max, &counts, 1.0);
|
||||
p99 = percentile_from_uniform_histogram(cnt, new_min, new_max, &counts, 99.0);
|
||||
refined_counts = counts;
|
||||
refined_count = cnt;
|
||||
refined_min = new_min;
|
||||
refined_max = new_max;
|
||||
}
|
||||
|
||||
// For integer-binned features, snap p1/p99 to integer boundaries
|
||||
// so each middle bin is exactly 1 unit wide.
|
||||
if integer_bins {
|
||||
p1 = p1.floor();
|
||||
p99 = p99.ceil();
|
||||
}
|
||||
|
||||
// Determine number of histogram bins
|
||||
let num_bins = if integer_bins && p99 > p1 {
|
||||
// One middle bin per integer + 2 outlier bins
|
||||
(p99 - p1) as usize + 2
|
||||
} else {
|
||||
// Count unique values within the p1–p99 range to cap histogram bins.
|
||||
// Using the full-range cardinality would over-allocate bins when outliers
|
||||
// inflate it (e.g. bedrooms: 1–137 unique values but only ~10 within p1–p99).
|
||||
let cardinality = {
|
||||
let mut unique_set = rustc_hash::FxHashSet::default();
|
||||
for &val in vals {
|
||||
if val.is_finite() && val >= p1 && val <= p99 {
|
||||
unique_set.insert(val.to_bits());
|
||||
}
|
||||
}
|
||||
unique_set.len()
|
||||
};
|
||||
HISTOGRAM_BINS.min(cardinality).max(3)
|
||||
};
|
||||
|
||||
// Build final histogram with outlier bins at edges:
|
||||
// - Bin 0: [min, p1) — low outliers
|
||||
// - Bins 1 to n-2: [p1, p99) — main distribution, evenly divided
|
||||
// - Bin n-1: [p99, max] — high outliers
|
||||
let mut counts = vec![0u64; num_bins];
|
||||
let middle_bins = num_bins.saturating_sub(2);
|
||||
let middle_width = if middle_bins > 0 && p99 > p1 {
|
||||
(p99 - p1) / middle_bins as f32
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
|
||||
for &value in vals {
|
||||
if value.is_finite() {
|
||||
let bin = if value < p1 {
|
||||
0 // Low outlier bin
|
||||
} else if value >= p99 {
|
||||
num_bins - 1 // High outlier bin
|
||||
} else if middle_width > 0.0 {
|
||||
// Middle bins (1 to n-2)
|
||||
let middle_bin = ((value - p1) / middle_width) as usize;
|
||||
(1 + middle_bin).min(num_bins - 2)
|
||||
} else {
|
||||
num_bins / 2 // Fallback if p1 == p99
|
||||
};
|
||||
counts[bin] += 1;
|
||||
}
|
||||
}
|
||||
|
||||
let histogram = Histogram {
|
||||
min: refined_min,
|
||||
max: refined_max,
|
||||
p1,
|
||||
p99,
|
||||
counts,
|
||||
};
|
||||
|
||||
// Compute slider bounds (use refined histogram for accurate percentiles)
|
||||
let (slider_min, slider_max) = match bounds {
|
||||
Bounds::Fixed {
|
||||
min: fmin,
|
||||
max: fmax,
|
||||
} => (*fmin, *fmax),
|
||||
Bounds::Percentile { low, high } => {
|
||||
let p_low = percentile_from_uniform_histogram(
|
||||
refined_count,
|
||||
refined_min,
|
||||
refined_max,
|
||||
&refined_counts,
|
||||
*low as f32,
|
||||
);
|
||||
let p_high = percentile_from_uniform_histogram(
|
||||
refined_count,
|
||||
refined_min,
|
||||
refined_max,
|
||||
&refined_counts,
|
||||
*high as f32,
|
||||
);
|
||||
(p_low, p_high)
|
||||
}
|
||||
};
|
||||
|
||||
FeatureStats {
|
||||
slider_min,
|
||||
slider_max,
|
||||
histogram,
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) fn column_to_f32_vec(column: &Column) -> anyhow::Result<Vec<f32>> {
|
||||
let float_series = column
|
||||
.cast(&DataType::Float32)
|
||||
.context("Failed to cast column to Float32")?;
|
||||
let chunked = float_series
|
||||
.f32()
|
||||
.context("Failed to get f32 chunked array")?;
|
||||
Ok(chunked
|
||||
.into_iter()
|
||||
.map(|value| value.unwrap_or(f32::NAN))
|
||||
.collect())
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::consts::QUANT_SCALE;
    use crate::features::Bounds;

    /// Shorthand for `Bounds::Fixed`.
    fn make_fixed_bounds(min: f32, max: f32) -> Bounds {
        Bounds::Fixed { min, max }
    }

    /// Shorthand for `Bounds::Percentile`.
    fn make_percentile_bounds(low: f64, high: f64) -> Bounds {
        Bounds::Percentile { low, high }
    }

    #[test]
    fn histogram_empty_data() {
        let data: Vec<f32> = vec![];
        let bounds = make_fixed_bounds(0.0, 100.0);
        let stats = compute_feature_stats(&data, &bounds, false);

        assert_eq!(stats.slider_min, 0.0);
        assert_eq!(stats.slider_max, 100.0);
        assert_eq!(stats.histogram.counts.iter().sum::<u64>(), 0);
    }

    #[test]
    fn histogram_single_value() {
        let data = vec![50.0_f32];
        let bounds = make_fixed_bounds(0.0, 100.0);
        let stats = compute_feature_stats(&data, &bounds, false);

        assert_eq!(stats.histogram.min, 50.0);
        assert_eq!(stats.histogram.max, 50.0);
        assert_eq!(stats.histogram.counts.iter().sum::<u64>(), 1);
    }

    #[test]
    fn histogram_uniform_distribution() {
        let data: Vec<f32> = (0..100).map(|i| i as f32).collect();
        let bounds = make_fixed_bounds(0.0, 100.0);
        let stats = compute_feature_stats(&data, &bounds, false);

        assert_eq!(stats.histogram.min, 0.0);
        assert_eq!(stats.histogram.max, 99.0);
        assert_eq!(stats.histogram.counts.iter().sum::<u64>(), 100);
    }

    #[test]
    fn histogram_with_nan_values() {
        let data = vec![10.0_f32, f32::NAN, 20.0, f32::NAN, 30.0];
        let bounds = make_fixed_bounds(0.0, 100.0);
        let stats = compute_feature_stats(&data, &bounds, false);

        assert_eq!(stats.histogram.counts.iter().sum::<u64>(), 3);
        assert_eq!(stats.histogram.min, 10.0);
        assert_eq!(stats.histogram.max, 30.0);
    }

    #[test]
    fn histogram_all_nan() {
        let data = vec![f32::NAN, f32::NAN, f32::NAN];
        let bounds = make_fixed_bounds(0.0, 100.0);
        let stats = compute_feature_stats(&data, &bounds, false);

        assert_eq!(stats.histogram.counts.iter().sum::<u64>(), 0);
    }

    #[test]
    fn histogram_all_same_value() {
        let data = vec![42.0_f32; 1000];
        let bounds = make_fixed_bounds(0.0, 100.0);
        let stats = compute_feature_stats(&data, &bounds, false);

        assert_eq!(stats.histogram.min, 42.0);
        assert_eq!(stats.histogram.max, 42.0);
        assert_eq!(stats.histogram.p1, 42.0);
        assert_eq!(stats.histogram.p99, 42.0);
        assert_eq!(stats.histogram.counts.iter().sum::<u64>(), 1000);
    }

    #[test]
    fn histogram_percentile_bounds() {
        let mut data: Vec<f32> = vec![0.0]; // Low outlier
        data.extend((1..99).map(|i| 50.0 + i as f32 * 0.01));
        data.push(1000.0); // High outlier

        let bounds = make_percentile_bounds(2.0, 98.0);
        let stats = compute_feature_stats(&data, &bounds, false);

        assert!(stats.slider_min > 0.0);
        assert!(stats.slider_max < 1000.0);
    }

    #[test]
    fn fixed_price_bounds_keep_slider_cap() {
        let data = vec![400_000.0_f32, 2_500_000.0, 3_750_000.0];
        let bounds = make_fixed_bounds(0.0, 2_500_000.0);
        let stats = compute_feature_stats(&data, &bounds, false);

        assert_eq!(stats.slider_min, 0.0);
        assert_eq!(stats.slider_max, 2_500_000.0);
    }

    #[test]
    fn histogram_bin_for_value() {
        let hist = Histogram {
            min: 0.0,
            max: 100.0,
            p1: 10.0,
            p99: 90.0,
            counts: vec![0; 10],
        };

        assert_eq!(hist.bin_for_value(5.0), 0); // Low outlier bin
        assert_eq!(hist.bin_for_value(95.0), 9); // High outlier bin

        let mid_value = 50.0;
        let bin = hist.bin_for_value(mid_value);
        assert!((1..=8).contains(&bin));
    }

    #[test]
    fn histogram_middle_bin_width() {
        let hist = Histogram {
            min: 0.0,
            max: 100.0,
            p1: 10.0,
            p99: 90.0,
            counts: vec![0; 10],
        };

        let expected_width = (90.0 - 10.0) / 8.0;
        assert!((hist.middle_bin_width() - expected_width).abs() < 0.001);
    }

    #[test]
    fn histogram_cardinality_caps_bins() {
        let data = vec![1.0_f32, 1.0, 2.0, 2.0, 3.0, 3.0];
        let bounds = make_fixed_bounds(0.0, 100.0);
        let stats = compute_feature_stats(&data, &bounds, false);

        assert_eq!(stats.histogram.counts.len(), 3);
    }

    #[test]
    fn min_max_skips_nan() {
        // NaN entries must not influence the reported range, wherever they
        // sit relative to the smallest and largest values.
        let data = vec![10.0_f32, f32::NAN, 20.0, f32::NAN, 5.0];
        let bounds = make_fixed_bounds(0.0, 100.0);
        let stats = compute_feature_stats(&data, &bounds, false);

        assert_eq!(stats.histogram.min, 5.0);
        assert_eq!(stats.histogram.max, 20.0);
    }

    #[test]
    fn count_skips_nan() {
        // Only the three finite values may end up in the histogram.
        let data = vec![1.0_f32, f32::NAN, 2.0, f32::NAN, 3.0];
        let bounds = make_fixed_bounds(0.0, 100.0);
        let stats = compute_feature_stats(&data, &bounds, false);

        assert_eq!(stats.histogram.counts.iter().sum::<u64>(), 3);
    }

    #[test]
    fn infinity_values_excluded() {
        let data = vec![f32::INFINITY, f32::NEG_INFINITY, 50.0];
        let bounds = make_fixed_bounds(0.0, 100.0);
        let stats = compute_feature_stats(&data, &bounds, false);

        assert_eq!(stats.histogram.min, 50.0);
        assert_eq!(stats.histogram.max, 50.0);
        assert_eq!(stats.histogram.counts.iter().sum::<u64>(), 1);
    }

    #[test]
    fn only_finite_values() {
        let data = vec![10.0_f32, 20.0, 30.0];
        let bounds = make_fixed_bounds(0.0, 100.0);
        let stats = compute_feature_stats(&data, &bounds, false);

        assert_eq!(stats.histogram.min, 10.0);
        assert_eq!(stats.histogram.max, 30.0);
        assert_eq!(stats.histogram.counts.iter().sum::<u64>(), 3);
    }

    #[test]
    fn extreme_outlier_does_not_destroy_quantization() {
        // Simulate floor area: 10k normal values (50-200 sqm) + one 317M outlier
        let mut data: Vec<f32> = (0..10_000).map(|i| 50.0 + (i % 150) as f32).collect();
        data.push(317_000_000.0); // Extreme outlier from web scraping

        let bounds = make_percentile_bounds(0.0, 98.0);
        let stats = compute_feature_stats(&data, &bounds, false);

        // After refinement, histogram range should be much tighter than 317M
        assert!(
            stats.histogram.max < 1_000_000.0,
            "histogram.max should be refined, got {}",
            stats.histogram.max,
        );
        // p1 should be near 50, not millions
        assert!(
            stats.histogram.p1 < 100.0,
            "p1 should be near real data, got {}",
            stats.histogram.p1,
        );
        // Slider min should reflect actual data range
        assert!(
            stats.slider_min < 100.0,
            "slider_min should be near real data, got {}",
            stats.slider_min,
        );

        // Quantization using histogram.min/max should give usable range
        let qmin = stats.histogram.min;
        let qrange = stats.histogram.max - stats.histogram.min;
        assert!(qrange > 0.0 && qrange < 1_000_000.0);

        // A typical floor area (100 sqm) should be distinguishable from min
        let normalized = (100.0 - qmin) / qrange;
        let encoded = (normalized * QUANT_SCALE).round() as u16;
        assert!(
            encoded > 100,
            "100 sqm should encode to a meaningful u16 value, got {}",
            encoded,
        );
    }
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue