Refactor and other improvements

This commit is contained in:
Andras Schmelczer 2026-02-08 18:25:58 +00:00
parent 04a78e7bfe
commit 6c90cf3c0f
47 changed files with 2705 additions and 1568 deletions

View file

@ -0,0 +1,163 @@
use std::collections::{HashMap, HashSet};
use rustc_hash::FxHashMap;
use crate::consts::MAX_PRICE_HISTORY_POINTS;
use crate::data::FeatureStats;
use super::hexagon_stats::{EnumFeatureStats, HistogramStats, NumericFeatureStats, PricePoint};
/// Extract price history (year, price) pairs from matching rows, downsampled if needed.
pub fn extract_price_history(
matching_rows: &[usize],
feature_data: &[f32],
num_features: usize,
feature_name_to_index: &FxHashMap<String, usize>,
) -> Vec<PricePoint> {
let year_idx = feature_name_to_index
.get("Date of last transaction")
.copied();
let price_idx = feature_name_to_index.get("Last known price").copied();
match (year_idx, price_idx) {
(Some(yi), Some(pi)) => {
let mut points: Vec<PricePoint> = matching_rows
.iter()
.filter_map(|&row| {
let year = feature_data[row * num_features + yi];
let price = feature_data[row * num_features + pi];
if year.is_finite() && price.is_finite() {
Some(PricePoint { year, price })
} else {
None
}
})
.collect();
if points.len() > MAX_PRICE_HISTORY_POINTS {
let step = points.len() as f64 / MAX_PRICE_HISTORY_POINTS as f64;
points = (0..MAX_PRICE_HISTORY_POINTS)
.map(|i| {
let idx = (i as f64 * step) as usize;
PricePoint {
year: points[idx].year,
price: points[idx].price,
}
})
.collect();
}
points
}
_ => Vec::new(),
}
}
/// Compute per-feature stats (numeric histograms + enum counts) for the given rows.
#[allow(clippy::too_many_arguments)]
pub fn compute_feature_stats(
matching_rows: &[usize],
feature_data: &[f32],
feature_names: &[String],
num_features: usize,
enum_values: &FxHashMap<usize, Vec<String>>,
feature_stats_data: &[FeatureStats],
fields_specified: bool,
field_set: &HashSet<String>,
) -> (Vec<NumericFeatureStats>, Vec<EnumFeatureStats>) {
let mut numeric_features = Vec::new();
let mut enum_features_out = Vec::new();
for (feature_index, feature_name) in feature_names.iter().enumerate() {
if fields_specified && !field_set.contains(feature_name.as_str()) {
continue;
}
if let Some(ev) = enum_values.get(&feature_index) {
let mut value_counts = vec![0u64; ev.len()];
for &row in matching_rows {
let value = feature_data[row * num_features + feature_index];
if value.is_finite() {
let idx = value as usize;
if idx < value_counts.len() {
value_counts[idx] += 1;
}
}
}
let counts: HashMap<String, u64> = value_counts
.iter()
.enumerate()
.filter(|(_, &count)| count > 0)
.map(|(idx, &count)| (ev[idx].clone(), count))
.collect();
if !counts.is_empty() {
enum_features_out.push(EnumFeatureStats {
name: feature_name.clone(),
counts,
});
}
} else {
let global_hist = &feature_stats_data[feature_index].histogram;
let p1 = global_hist.p1;
let p99 = global_hist.p99;
let num_bins = global_hist.counts.len();
let mut count = 0usize;
let mut min_value = f32::INFINITY;
let mut max_value = f32::NEG_INFINITY;
let mut sum = 0.0f64;
let mut bins = vec![0u64; num_bins];
let middle_bins = num_bins.saturating_sub(2);
let middle_width = if middle_bins > 0 && p99 > p1 {
(p99 - p1) / middle_bins as f32
} else {
0.0
};
for &row in matching_rows {
let value = feature_data[row * num_features + feature_index];
if value.is_finite() {
count += 1;
if value < min_value {
min_value = value;
}
if value > max_value {
max_value = value;
}
sum += value as f64;
let bin = if value < p1 {
0
} else if value >= p99 {
num_bins - 1
} else if middle_width > 0.0 {
let middle_bin = ((value - p1) / middle_width) as usize;
(1 + middle_bin).min(num_bins - 2)
} else {
num_bins / 2
};
bins[bin] += 1;
}
}
if count > 0 {
numeric_features.push(NumericFeatureStats {
name: feature_name.clone(),
count,
min: min_value as f64,
max: max_value as f64,
mean: sum / count as f64,
histogram: HistogramStats {
min: global_hist.min as f64,
max: global_hist.max as f64,
p1: p1 as f64,
p99: p99 as f64,
counts: bins,
},
});
}
}
}
(numeric_features, enum_features_out)
}