good stuff

This commit is contained in:
Andras Schmelczer 2026-03-15 21:10:54 +00:00
parent ea8389ef40
commit f4de0eeb9f
39 changed files with 5165 additions and 348 deletions

View file

@ -50,7 +50,32 @@ pub fn extract_price_history(
}
}
/// Running aggregation state for a single feature.
///
/// One value is created per feature before any row is visited; the variant
/// decides how that feature's values are folded in during the row loop.
enum FeatureAccum {
    /// Numeric feature: running count/sum/min/max plus histogram bin counts.
    Numeric {
        // --- running statistics, updated for every finite value ---
        /// Number of finite values folded in so far.
        count: usize,
        /// Sum of the finite values, kept in `f64`.
        sum: f64,
        /// Smallest finite value seen (initialised to `f32::INFINITY`).
        min_value: f32,
        /// Largest finite value seen (initialised to `f32::NEG_INFINITY`).
        max_value: f32,
        /// Histogram counts; allocated with `num_bins` entries.
        bins: Vec<u64>,

        // --- binning parameters, fixed when the accumulator is built ---
        /// Number of histogram bins (the length of the global histogram's counts).
        num_bins: usize,
        /// Lower histogram bound from the global histogram; values below it
        /// are counted in the first bin.
        p1: f32,
        /// Upper histogram bound from the global histogram; values at or
        /// above it are counted in the last bin.
        p99: f32,
        /// Width of each bin between `p1` and `p99`; `0.0` when there are no
        /// middle bins or `p99` is not greater than `p1`.
        middle_width: f32,

        // --- global range, copied through to the reported histogram ---
        /// Minimum recorded in the feature's global histogram.
        global_min: f32,
        /// Maximum recorded in the feature's global histogram.
        global_max: f32,
    },
    /// Enum feature: one occurrence counter per variant index.
    Enum {
        /// `value_counts[i]` is the number of rows whose value maps to variant `i`.
        value_counts: Vec<u64>,
    },
    /// Feature excluded from the result (a field set was given and does not contain it).
    Skip,
}
/// Compute per-feature stats (numeric histograms + enum counts) for the given rows.
/// Single-pass: iterates rows in the outer loop for cache-friendly row-major access.
#[allow(clippy::too_many_arguments)]
pub fn compute_feature_stats(
matching_rows: &[usize],
@ -61,107 +86,161 @@ pub fn compute_feature_stats(
fields_specified: bool,
field_set: &HashSet<String>,
) -> (Vec<NumericFeatureStats>, Vec<EnumFeatureStats>) {
let num_features = feature_names.len();
// Pre-allocate accumulators for all features
let mut accums: Vec<FeatureAccum> = (0..num_features)
.map(|fi| {
let feature_name = &feature_names[fi];
if fields_specified && !field_set.contains(feature_name.as_str()) {
return FeatureAccum::Skip;
}
if let Some(ev) = enum_values.get(&fi) {
FeatureAccum::Enum {
value_counts: vec![0u64; ev.len()],
}
} else {
let global_hist = &feature_stats_data[fi].histogram;
let p1 = global_hist.p1;
let p99 = global_hist.p99;
let num_bins = global_hist.counts.len();
let middle_bins = num_bins.saturating_sub(2);
let middle_width = if middle_bins > 0 && p99 > p1 {
(p99 - p1) / middle_bins as f32
} else {
0.0
};
FeatureAccum::Numeric {
count: 0,
min_value: f32::INFINITY,
max_value: f32::NEG_INFINITY,
sum: 0.0,
bins: vec![0u64; num_bins],
p1,
p99,
middle_width,
num_bins,
global_min: global_hist.min,
global_max: global_hist.max,
}
}
})
.collect();
// Single pass: outer loop = rows, inner loop = features (cache-friendly row-major access)
for &row in matching_rows {
for (fi, accum) in accums.iter_mut().enumerate() {
match accum {
FeatureAccum::Skip => {}
FeatureAccum::Enum { value_counts } => {
let value = data.get_feature(row, fi);
if value.is_finite() {
let idx = value as usize;
if idx < value_counts.len() {
value_counts[idx] += 1;
} else {
warn!(
feature = feature_names[fi].as_str(),
idx,
max = value_counts.len(),
"Enum index out of bounds — possible data/schema mismatch"
);
}
}
}
FeatureAccum::Numeric {
count,
min_value,
max_value,
sum,
bins,
p1,
p99,
middle_width,
num_bins,
..
} => {
let value = data.get_feature(row, fi);
if value.is_finite() {
*count += 1;
if value < *min_value {
*min_value = value;
}
if value > *max_value {
*max_value = value;
}
*sum += value as f64;
let bin = if value < *p1 {
0
} else if value >= *p99 {
*num_bins - 1
} else if *middle_width > 0.0 {
let middle_bin = ((value - *p1) / *middle_width) as usize;
(1 + middle_bin).min(*num_bins - 2)
} else {
*num_bins / 2
};
bins[bin] += 1;
}
}
}
}
}
// Build response structs from accumulators
let mut numeric_features = Vec::new();
let mut enum_features_out = Vec::new();
for (feature_index, feature_name) in feature_names.iter().enumerate() {
if fields_specified && !field_set.contains(feature_name.as_str()) {
continue;
}
for (fi, accum) in accums.into_iter().enumerate() {
match accum {
FeatureAccum::Skip => {}
FeatureAccum::Enum { value_counts } => {
let ev = &enum_values[&fi];
let counts: HashMap<String, u64> = value_counts
.iter()
.enumerate()
.filter(|(_, &count)| count > 0)
.map(|(idx, &count)| (ev[idx].clone(), count))
.collect();
if let Some(ev) = enum_values.get(&feature_index) {
let mut value_counts = vec![0u64; ev.len()];
for &row in matching_rows {
let value = data.get_feature(row, feature_index);
if value.is_finite() {
let idx = value as usize;
if idx < value_counts.len() {
value_counts[idx] += 1;
} else {
warn!(
feature = feature_name.as_str(),
idx,
max = value_counts.len(),
"Enum index out of bounds — possible data/schema mismatch"
);
}
if !counts.is_empty() {
enum_features_out.push(EnumFeatureStats {
name: feature_names[fi].clone(),
counts,
});
}
}
let counts: HashMap<String, u64> = value_counts
.iter()
.enumerate()
.filter(|(_, &count)| count > 0)
.map(|(idx, &count)| (ev[idx].clone(), count))
.collect();
if !counts.is_empty() {
enum_features_out.push(EnumFeatureStats {
name: feature_name.clone(),
counts,
});
}
} else {
let global_hist = &feature_stats_data[feature_index].histogram;
let p1 = global_hist.p1;
let p99 = global_hist.p99;
let num_bins = global_hist.counts.len();
let mut count = 0usize;
let mut min_value = f32::INFINITY;
let mut max_value = f32::NEG_INFINITY;
let mut sum = 0.0f64;
let mut bins = vec![0u64; num_bins];
let middle_bins = num_bins.saturating_sub(2);
let middle_width = if middle_bins > 0 && p99 > p1 {
(p99 - p1) / middle_bins as f32
} else {
0.0
};
for &row in matching_rows {
let value = data.get_feature(row, feature_index);
if value.is_finite() {
count += 1;
if value < min_value {
min_value = value;
}
if value > max_value {
max_value = value;
}
sum += value as f64;
let bin = if value < p1 {
0
} else if value >= p99 {
num_bins - 1
} else if middle_width > 0.0 {
let middle_bin = ((value - p1) / middle_width) as usize;
(1 + middle_bin).min(num_bins - 2)
} else {
num_bins / 2
};
bins[bin] += 1;
FeatureAccum::Numeric {
count,
min_value,
max_value,
sum,
bins,
p1,
p99,
global_min,
global_max,
..
} => {
if count > 0 {
numeric_features.push(NumericFeatureStats {
name: feature_names[fi].clone(),
count,
min: min_value as f64,
max: max_value as f64,
mean: sum / count as f64,
histogram: HistogramStats {
min: global_min as f64,
max: global_max as f64,
p1: p1 as f64,
p99: p99 as f64,
counts: bins,
},
});
}
}
if count > 0 {
numeric_features.push(NumericFeatureStats {
name: feature_name.clone(),
count,
min: min_value as f64,
max: max_value as f64,
mean: sum / count as f64,
histogram: HistogramStats {
min: global_hist.min as f64,
max: global_hist.max as f64,
p1: p1 as f64,
p99: p99 as f64,
counts: bins,
},
});
}
}
}