Fix floor area outliers

This commit is contained in:
Andras Schmelczer 2026-03-24 22:53:22 +00:00
parent 582bc856d8
commit da1bf49524

View file

@ -354,6 +354,45 @@ pub fn compute_feature_stats(vals: &[f32], bounds: &Bounds, integer_bins: bool)
let mut p1 = percentile_from_uniform_histogram(count, min, max, &prelim_counts, 1.0);
let mut p99 = percentile_from_uniform_histogram(count, min, max, &prelim_counts, 99.0);
// Iterative refinement for outlier-dominated distributions.
// When extreme outliers (e.g. 317M sqm from web scraping) dominate the range,
// the uniform histogram puts all real data in one bin, making percentile
// estimation useless. Zoom into the estimated data region and recompute.
let mut refined_counts = prelim_counts;
let mut refined_count = count;
let mut refined_min = min;
let mut refined_max = max;
for _ in 0..3 {
let iqr = p99 - p1;
if iqr <= 0.0 || (refined_max - refined_min) <= 5.0 * iqr {
break;
}
let new_min = (p1 - iqr).max(min);
let new_max = p99 + iqr;
if new_max <= new_min {
break;
}
let bin_width = (new_max - new_min) / HISTOGRAM_BINS as f32;
let mut counts = vec![0u64; HISTOGRAM_BINS];
let mut cnt = 0usize;
for &value in vals {
if value.is_finite() && value >= new_min && value <= new_max {
let bin = ((value - new_min) / bin_width) as usize;
counts[bin.min(HISTOGRAM_BINS - 1)] += 1;
cnt += 1;
}
}
if cnt == 0 {
break;
}
p1 = percentile_from_uniform_histogram(cnt, new_min, new_max, &counts, 1.0);
p99 = percentile_from_uniform_histogram(cnt, new_min, new_max, &counts, 99.0);
refined_counts = counts;
refined_count = cnt;
refined_min = new_min;
refined_max = new_max;
}
// For integer-binned features, snap p1/p99 to integer boundaries
// so each middle bin is exactly 1 unit wide.
if integer_bins {
@ -411,24 +450,34 @@ pub fn compute_feature_stats(vals: &[f32], bounds: &Bounds, integer_bins: bool)
}
let histogram = Histogram {
min,
max,
min: refined_min,
max: refined_max,
p1,
p99,
counts,
};
// Compute slider bounds
// Compute slider bounds (use refined histogram for accurate percentiles)
let (slider_min, slider_max) = match bounds {
Bounds::Fixed {
min: fmin,
max: fmax,
} => (*fmin, *fmax),
Bounds::Percentile { low, high } => {
let p_low =
percentile_from_uniform_histogram(count, min, max, &prelim_counts, *low as f32);
let p_high =
percentile_from_uniform_histogram(count, min, max, &prelim_counts, *high as f32);
let p_low = percentile_from_uniform_histogram(
refined_count,
refined_min,
refined_max,
&refined_counts,
*low as f32,
);
let p_high = percentile_from_uniform_histogram(
refined_count,
refined_min,
refined_max,
&refined_counts,
*high as f32,
);
(p_low, p_high)
}
};
@ -1402,4 +1451,47 @@ mod tests {
assert_eq!(stats.histogram.max, 30.0);
assert_eq!(stats.histogram.counts.iter().sum::<u64>(), 3);
}
#[test]
fn extreme_outlier_does_not_destroy_quantization() {
    // Floor-area-like sample: 10k plausible values cycling through 50..=199 sqm,
    // followed by one absurd 317M sqm entry standing in for a web-scraping glitch.
    let samples: Vec<f32> = (0..10_000)
        .map(|i| 50.0 + (i % 150) as f32)
        .chain(std::iter::once(317_000_000.0))
        .collect();

    let stats = compute_feature_stats(&samples, &make_percentile_bounds(0.0, 98.0), false);
    let hist = &stats.histogram;

    // The reported histogram range must have been narrowed well below the outlier.
    assert!(
        hist.max < 1_000_000.0,
        "histogram.max should be refined, got {}",
        hist.max,
    );

    // The 1st percentile has to sit in the real data, not in the millions.
    assert!(
        hist.p1 < 100.0,
        "p1 should be near real data, got {}",
        hist.p1,
    );

    // The lower slider bound should likewise track the real data range.
    assert!(
        stats.slider_min < 100.0,
        "slider_min should be near real data, got {}",
        stats.slider_min,
    );

    // Quantizing against histogram.min/max needs a positive, sane-sized span.
    let span = hist.max - hist.min;
    assert!(span > 0.0 && span < 1_000_000.0);

    // A typical 100 sqm value must land clearly above the bottom of the
    // quantized scale, i.e. remain distinguishable from the minimum.
    let fraction = (100.0 - hist.min) / span;
    let code = (fraction * QUANT_SCALE).round() as u16;
    assert!(
        code > 100,
        "100 sqm should encode to a meaningful u16 value, got {}",
        code,
    );
}
}