Fix floor area outliers
This commit is contained in:
parent
582bc856d8
commit
da1bf49524
1 changed file with 99 additions and 7 deletions
|
|
@ -354,6 +354,45 @@ pub fn compute_feature_stats(vals: &[f32], bounds: &Bounds, integer_bins: bool)
|
||||||
let mut p1 = percentile_from_uniform_histogram(count, min, max, &prelim_counts, 1.0);
|
let mut p1 = percentile_from_uniform_histogram(count, min, max, &prelim_counts, 1.0);
|
||||||
let mut p99 = percentile_from_uniform_histogram(count, min, max, &prelim_counts, 99.0);
|
let mut p99 = percentile_from_uniform_histogram(count, min, max, &prelim_counts, 99.0);
|
||||||
|
|
||||||
|
// Iterative refinement for outlier-dominated distributions.
|
||||||
|
// When extreme outliers (e.g. 317M sqm from web scraping) dominate the range,
|
||||||
|
// the uniform histogram puts all real data in one bin, making percentile
|
||||||
|
// estimation useless. Zoom into the estimated data region and recompute.
|
||||||
|
let mut refined_counts = prelim_counts;
|
||||||
|
let mut refined_count = count;
|
||||||
|
let mut refined_min = min;
|
||||||
|
let mut refined_max = max;
|
||||||
|
for _ in 0..3 {
|
||||||
|
let iqr = p99 - p1;
|
||||||
|
if iqr <= 0.0 || (refined_max - refined_min) <= 5.0 * iqr {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
let new_min = (p1 - iqr).max(min);
|
||||||
|
let new_max = p99 + iqr;
|
||||||
|
if new_max <= new_min {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
let bin_width = (new_max - new_min) / HISTOGRAM_BINS as f32;
|
||||||
|
let mut counts = vec![0u64; HISTOGRAM_BINS];
|
||||||
|
let mut cnt = 0usize;
|
||||||
|
for &value in vals {
|
||||||
|
if value.is_finite() && value >= new_min && value <= new_max {
|
||||||
|
let bin = ((value - new_min) / bin_width) as usize;
|
||||||
|
counts[bin.min(HISTOGRAM_BINS - 1)] += 1;
|
||||||
|
cnt += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if cnt == 0 {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
p1 = percentile_from_uniform_histogram(cnt, new_min, new_max, &counts, 1.0);
|
||||||
|
p99 = percentile_from_uniform_histogram(cnt, new_min, new_max, &counts, 99.0);
|
||||||
|
refined_counts = counts;
|
||||||
|
refined_count = cnt;
|
||||||
|
refined_min = new_min;
|
||||||
|
refined_max = new_max;
|
||||||
|
}
|
||||||
|
|
||||||
// For integer-binned features, snap p1/p99 to integer boundaries
|
// For integer-binned features, snap p1/p99 to integer boundaries
|
||||||
// so each middle bin is exactly 1 unit wide.
|
// so each middle bin is exactly 1 unit wide.
|
||||||
if integer_bins {
|
if integer_bins {
|
||||||
|
|
@ -411,24 +450,34 @@ pub fn compute_feature_stats(vals: &[f32], bounds: &Bounds, integer_bins: bool)
|
||||||
}
|
}
|
||||||
|
|
||||||
let histogram = Histogram {
|
let histogram = Histogram {
|
||||||
min,
|
min: refined_min,
|
||||||
max,
|
max: refined_max,
|
||||||
p1,
|
p1,
|
||||||
p99,
|
p99,
|
||||||
counts,
|
counts,
|
||||||
};
|
};
|
||||||
|
|
||||||
// Compute slider bounds
|
// Compute slider bounds (use refined histogram for accurate percentiles)
|
||||||
let (slider_min, slider_max) = match bounds {
|
let (slider_min, slider_max) = match bounds {
|
||||||
Bounds::Fixed {
|
Bounds::Fixed {
|
||||||
min: fmin,
|
min: fmin,
|
||||||
max: fmax,
|
max: fmax,
|
||||||
} => (*fmin, *fmax),
|
} => (*fmin, *fmax),
|
||||||
Bounds::Percentile { low, high } => {
|
Bounds::Percentile { low, high } => {
|
||||||
let p_low =
|
let p_low = percentile_from_uniform_histogram(
|
||||||
percentile_from_uniform_histogram(count, min, max, &prelim_counts, *low as f32);
|
refined_count,
|
||||||
let p_high =
|
refined_min,
|
||||||
percentile_from_uniform_histogram(count, min, max, &prelim_counts, *high as f32);
|
refined_max,
|
||||||
|
&refined_counts,
|
||||||
|
*low as f32,
|
||||||
|
);
|
||||||
|
let p_high = percentile_from_uniform_histogram(
|
||||||
|
refined_count,
|
||||||
|
refined_min,
|
||||||
|
refined_max,
|
||||||
|
&refined_counts,
|
||||||
|
*high as f32,
|
||||||
|
);
|
||||||
(p_low, p_high)
|
(p_low, p_high)
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
@ -1402,4 +1451,47 @@ mod tests {
|
||||||
assert_eq!(stats.histogram.max, 30.0);
|
assert_eq!(stats.histogram.max, 30.0);
|
||||||
assert_eq!(stats.histogram.counts.iter().sum::<u64>(), 3);
|
assert_eq!(stats.histogram.counts.iter().sum::<u64>(), 3);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn extreme_outlier_does_not_destroy_quantization() {
|
||||||
|
// Simulate floor area: 10k normal values (50-200 sqm) + one 317M outlier
|
||||||
|
let mut data: Vec<f32> = (0..10_000).map(|i| 50.0 + (i % 150) as f32).collect();
|
||||||
|
data.push(317_000_000.0); // Extreme outlier from web scraping
|
||||||
|
|
||||||
|
let bounds = make_percentile_bounds(0.0, 98.0);
|
||||||
|
let stats = compute_feature_stats(&data, &bounds, false);
|
||||||
|
|
||||||
|
// After refinement, histogram range should be much tighter than 317M
|
||||||
|
assert!(
|
||||||
|
stats.histogram.max < 1_000_000.0,
|
||||||
|
"histogram.max should be refined, got {}",
|
||||||
|
stats.histogram.max,
|
||||||
|
);
|
||||||
|
// p1 should be near 50, not millions
|
||||||
|
assert!(
|
||||||
|
stats.histogram.p1 < 100.0,
|
||||||
|
"p1 should be near real data, got {}",
|
||||||
|
stats.histogram.p1,
|
||||||
|
);
|
||||||
|
// Slider min should reflect actual data range
|
||||||
|
assert!(
|
||||||
|
stats.slider_min < 100.0,
|
||||||
|
"slider_min should be near real data, got {}",
|
||||||
|
stats.slider_min,
|
||||||
|
);
|
||||||
|
|
||||||
|
// Quantization using histogram.min/max should give usable range
|
||||||
|
let qmin = stats.histogram.min;
|
||||||
|
let qrange = stats.histogram.max - stats.histogram.min;
|
||||||
|
assert!(qrange > 0.0 && qrange < 1_000_000.0);
|
||||||
|
|
||||||
|
// A typical floor area (100 sqm) should be distinguishable from min
|
||||||
|
let normalized = (100.0 - qmin) / qrange;
|
||||||
|
let encoded = (normalized * QUANT_SCALE).round() as u16;
|
||||||
|
assert!(
|
||||||
|
encoded > 100,
|
||||||
|
"100 sqm should encode to a meaningful u16 value, got {}",
|
||||||
|
encoded,
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue