From da1bf495247d3bd6d94e32236819b7a71cf2620c Mon Sep 17 00:00:00 2001 From: Andras Schmelczer Date: Tue, 24 Mar 2026 22:53:22 +0000 Subject: [PATCH] Fix floor area outliers --- server-rs/src/data/property.rs | 106 ++++++++++++++++++++++++++++++--- 1 file changed, 99 insertions(+), 7 deletions(-) diff --git a/server-rs/src/data/property.rs b/server-rs/src/data/property.rs index a95d4e5..9e8a142 100644 --- a/server-rs/src/data/property.rs +++ b/server-rs/src/data/property.rs @@ -354,6 +354,45 @@ pub fn compute_feature_stats(vals: &[f32], bounds: &Bounds, integer_bins: bool) let mut p1 = percentile_from_uniform_histogram(count, min, max, &prelim_counts, 1.0); let mut p99 = percentile_from_uniform_histogram(count, min, max, &prelim_counts, 99.0); + // Iterative refinement for outlier-dominated distributions. + // When extreme outliers (e.g. 317M sqm from web scraping) dominate the range, + // the uniform histogram puts all real data in one bin, making percentile + // estimation useless. Zoom into the estimated data region and recompute. 
+ let mut refined_counts = prelim_counts; + let mut refined_count = count; + let mut refined_min = min; + let mut refined_max = max; + for _ in 0..3 { + let iqr = p99 - p1; + if iqr <= 0.0 || (refined_max - refined_min) <= 5.0 * iqr { + break; + } + let new_min = (p1 - iqr).max(min); + let new_max = p99 + iqr; + if new_max <= new_min { + break; + } + let bin_width = (new_max - new_min) / HISTOGRAM_BINS as f32; + let mut counts = vec![0u64; HISTOGRAM_BINS]; + let mut cnt = 0usize; + for &value in vals { + if value.is_finite() && value >= new_min && value <= new_max { + let bin = ((value - new_min) / bin_width) as usize; + counts[bin.min(HISTOGRAM_BINS - 1)] += 1; + cnt += 1; + } + } + if cnt == 0 { + break; + } + p1 = percentile_from_uniform_histogram(cnt, new_min, new_max, &counts, 1.0); + p99 = percentile_from_uniform_histogram(cnt, new_min, new_max, &counts, 99.0); + refined_counts = counts; + refined_count = cnt; + refined_min = new_min; + refined_max = new_max; + } + // For integer-binned features, snap p1/p99 to integer boundaries // so each middle bin is exactly 1 unit wide. 
if integer_bins { @@ -411,24 +450,34 @@ pub fn compute_feature_stats(vals: &[f32], bounds: &Bounds, integer_bins: bool) } let histogram = Histogram { - min, - max, + min: refined_min, + max: refined_max, p1, p99, counts, }; - // Compute slider bounds + // Compute slider bounds (use refined histogram for accurate percentiles) let (slider_min, slider_max) = match bounds { Bounds::Fixed { min: fmin, max: fmax, } => (*fmin, *fmax), Bounds::Percentile { low, high } => { - let p_low = - percentile_from_uniform_histogram(count, min, max, &prelim_counts, *low as f32); - let p_high = - percentile_from_uniform_histogram(count, min, max, &prelim_counts, *high as f32); + let p_low = percentile_from_uniform_histogram( + refined_count, + refined_min, + refined_max, + &refined_counts, + *low as f32, + ); + let p_high = percentile_from_uniform_histogram( + refined_count, + refined_min, + refined_max, + &refined_counts, + *high as f32, + ); (p_low, p_high) } }; @@ -1402,4 +1451,47 @@ mod tests { assert_eq!(stats.histogram.max, 30.0); assert_eq!(stats.histogram.counts.iter().sum::<u64>(), 3); } + + #[test] + fn extreme_outlier_does_not_destroy_quantization() { + // Simulate floor area: 10k normal values (50-200 sqm) + one 317M outlier + let mut data: Vec<f32> = (0..10_000).map(|i| 50.0 + (i % 150) as f32).collect(); + data.push(317_000_000.0); // Extreme outlier from web scraping + + let bounds = make_percentile_bounds(0.0, 98.0); + let stats = compute_feature_stats(&data, &bounds, false); + + // After refinement, histogram range should be much tighter than 317M + assert!( + stats.histogram.max < 1_000_000.0, + "histogram.max should be refined, got {}", + stats.histogram.max, + ); + // p1 should be near 50, not millions + assert!( + stats.histogram.p1 < 100.0, + "p1 should be near real data, got {}", + stats.histogram.p1, + ); + // Slider min should reflect actual data range + assert!( + stats.slider_min < 100.0, + "slider_min should be near real data, got {}", + stats.slider_min, + ); + + // 
Quantization using histogram.min/max should give usable range + let qmin = stats.histogram.min; + let qrange = stats.histogram.max - stats.histogram.min; + assert!(qrange > 0.0 && qrange < 1_000_000.0); + + // A typical floor area (100 sqm) should be distinguishable from min + let normalized = (100.0 - qmin) / qrange; + let encoded = (normalized * QUANT_SCALE).round() as u16; + assert!( + encoded > 100, + "100 sqm should encode to a meaningful u16 value, got {}", + encoded, + ); + } }