Fix floor area outliers
This commit is contained in:
parent
582bc856d8
commit
da1bf49524
1 changed file with 99 additions and 7 deletions
|
|
@ -354,6 +354,45 @@ pub fn compute_feature_stats(vals: &[f32], bounds: &Bounds, integer_bins: bool)
|
|||
let mut p1 = percentile_from_uniform_histogram(count, min, max, &prelim_counts, 1.0);
|
||||
let mut p99 = percentile_from_uniform_histogram(count, min, max, &prelim_counts, 99.0);
|
||||
|
||||
// Iterative refinement for outlier-dominated distributions.
|
||||
// When extreme outliers (e.g. 317M sqm from web scraping) dominate the range,
|
||||
// the uniform histogram puts all real data in one bin, making percentile
|
||||
// estimation useless. Zoom into the estimated data region and recompute.
|
||||
let mut refined_counts = prelim_counts;
|
||||
let mut refined_count = count;
|
||||
let mut refined_min = min;
|
||||
let mut refined_max = max;
|
||||
for _ in 0..3 {
|
||||
let iqr = p99 - p1;
|
||||
if iqr <= 0.0 || (refined_max - refined_min) <= 5.0 * iqr {
|
||||
break;
|
||||
}
|
||||
let new_min = (p1 - iqr).max(min);
|
||||
let new_max = p99 + iqr;
|
||||
if new_max <= new_min {
|
||||
break;
|
||||
}
|
||||
let bin_width = (new_max - new_min) / HISTOGRAM_BINS as f32;
|
||||
let mut counts = vec![0u64; HISTOGRAM_BINS];
|
||||
let mut cnt = 0usize;
|
||||
for &value in vals {
|
||||
if value.is_finite() && value >= new_min && value <= new_max {
|
||||
let bin = ((value - new_min) / bin_width) as usize;
|
||||
counts[bin.min(HISTOGRAM_BINS - 1)] += 1;
|
||||
cnt += 1;
|
||||
}
|
||||
}
|
||||
if cnt == 0 {
|
||||
break;
|
||||
}
|
||||
p1 = percentile_from_uniform_histogram(cnt, new_min, new_max, &counts, 1.0);
|
||||
p99 = percentile_from_uniform_histogram(cnt, new_min, new_max, &counts, 99.0);
|
||||
refined_counts = counts;
|
||||
refined_count = cnt;
|
||||
refined_min = new_min;
|
||||
refined_max = new_max;
|
||||
}
|
||||
|
||||
// For integer-binned features, snap p1/p99 to integer boundaries
|
||||
// so each middle bin is exactly 1 unit wide.
|
||||
if integer_bins {
|
||||
|
|
@ -411,24 +450,34 @@ pub fn compute_feature_stats(vals: &[f32], bounds: &Bounds, integer_bins: bool)
|
|||
}
|
||||
|
||||
let histogram = Histogram {
|
||||
min,
|
||||
max,
|
||||
min: refined_min,
|
||||
max: refined_max,
|
||||
p1,
|
||||
p99,
|
||||
counts,
|
||||
};
|
||||
|
||||
// Compute slider bounds
|
||||
// Compute slider bounds (use refined histogram for accurate percentiles)
|
||||
let (slider_min, slider_max) = match bounds {
|
||||
Bounds::Fixed {
|
||||
min: fmin,
|
||||
max: fmax,
|
||||
} => (*fmin, *fmax),
|
||||
Bounds::Percentile { low, high } => {
|
||||
let p_low =
|
||||
percentile_from_uniform_histogram(count, min, max, &prelim_counts, *low as f32);
|
||||
let p_high =
|
||||
percentile_from_uniform_histogram(count, min, max, &prelim_counts, *high as f32);
|
||||
let p_low = percentile_from_uniform_histogram(
|
||||
refined_count,
|
||||
refined_min,
|
||||
refined_max,
|
||||
&refined_counts,
|
||||
*low as f32,
|
||||
);
|
||||
let p_high = percentile_from_uniform_histogram(
|
||||
refined_count,
|
||||
refined_min,
|
||||
refined_max,
|
||||
&refined_counts,
|
||||
*high as f32,
|
||||
);
|
||||
(p_low, p_high)
|
||||
}
|
||||
};
|
||||
|
|
@ -1402,4 +1451,47 @@ mod tests {
|
|||
assert_eq!(stats.histogram.max, 30.0);
|
||||
assert_eq!(stats.histogram.counts.iter().sum::<u64>(), 3);
|
||||
}
|
||||
|
||||
#[test]
fn extreme_outlier_does_not_destroy_quantization() {
    // Synthetic floor-area sample: 10k plausible values cycling through
    // 50..=199 sqm, followed by one absurd 317M sqm entry of the kind a
    // web scraper can produce.
    let mut samples: Vec<f32> = Vec::with_capacity(10_001);
    samples.extend((0..10_000).map(|i| 50.0 + (i % 150) as f32));
    samples.push(317_000_000.0);

    let stats = compute_feature_stats(&samples, &make_percentile_bounds(0.0, 98.0), false);
    let hist = &stats.histogram;

    // The reported histogram range must have been narrowed well below the outlier.
    assert!(
        hist.max < 1_000_000.0,
        "histogram.max should be refined, got {}",
        hist.max,
    );

    // The 1st percentile has to sit in the real data, not in the millions.
    assert!(
        hist.p1 < 100.0,
        "p1 should be near real data, got {}",
        hist.p1,
    );

    // The lower slider bound must likewise track the real data.
    assert!(
        stats.slider_min < 100.0,
        "slider_min should be near real data, got {}",
        stats.slider_min,
    );

    // The span used for quantization (histogram.max - histogram.min) must be
    // positive and far smaller than the outlier-dominated range.
    let qrange = hist.max - hist.min;
    assert!(qrange > 0.0 && qrange < 1_000_000.0);

    // Quantize an ordinary 100 sqm value against that span: it has to land
    // clearly above the bottom of the u16 code space.
    let encoded = (((100.0 - hist.min) / qrange) * QUANT_SCALE).round() as u16;
    assert!(
        encoded > 100,
        "100 sqm should encode to a meaningful u16 value, got {}",
        encoded,
    );
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue