good stuff
This commit is contained in:
parent
ea8389ef40
commit
f4de0eeb9f
39 changed files with 5165 additions and 348 deletions
|
|
@ -50,7 +50,32 @@ pub fn extract_price_history(
|
|||
}
|
||||
}
|
||||
|
||||
/// Per-feature accumulator kind, determined once before the row loop.
///
/// One accumulator is created per feature, updated for every matching row,
/// and finally converted into the response structs.
#[derive(Debug, Clone, PartialEq)]
enum FeatureAccum {
    /// Numeric: track count, min, max, sum, histogram bins.
    Numeric {
        /// Number of finite values seen so far.
        count: usize,
        /// Smallest finite value seen; initialised to `f32::INFINITY`.
        min_value: f32,
        /// Largest finite value seen; initialised to `f32::NEG_INFINITY`.
        max_value: f32,
        /// Running sum of finite values, kept in `f64` for the mean.
        sum: f64,
        /// Per-bin counts; allocated with `num_bins` entries.
        bins: Vec<u64>,
        /// Lower cut-off taken from the global histogram; values below it
        /// are counted in the first bin.
        p1: f32,
        /// Upper cut-off taken from the global histogram; values at or above
        /// it are counted in the last bin.
        p99: f32,
        /// Width of one middle bin, `(p99 - p1) / (num_bins - 2)`; `0.0` when
        /// there are no middle bins or `p99 <= p1`.
        middle_width: f32,
        /// Number of bins in the global histogram.
        num_bins: usize,
        /// Global histogram minimum, copied through to the output histogram.
        global_min: f32,
        /// Global histogram maximum, copied through to the output histogram.
        global_max: f32,
    },
    /// Enum: count occurrences per variant index.
    Enum {
        /// One counter per enum variant, indexed by the feature value cast
        /// to `usize`.
        value_counts: Vec<u64>,
    },
    /// Feature skipped (not in field_set).
    Skip,
}
|
||||
|
||||
/// Compute per-feature stats (numeric histograms + enum counts) for the given rows.
|
||||
/// Single-pass: iterates rows in the outer loop for cache-friendly row-major access.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn compute_feature_stats(
|
||||
matching_rows: &[usize],
|
||||
|
|
@ -61,107 +86,161 @@ pub fn compute_feature_stats(
|
|||
fields_specified: bool,
|
||||
field_set: &HashSet<String>,
|
||||
) -> (Vec<NumericFeatureStats>, Vec<EnumFeatureStats>) {
|
||||
let num_features = feature_names.len();
|
||||
|
||||
// Pre-allocate accumulators for all features
|
||||
let mut accums: Vec<FeatureAccum> = (0..num_features)
|
||||
.map(|fi| {
|
||||
let feature_name = &feature_names[fi];
|
||||
if fields_specified && !field_set.contains(feature_name.as_str()) {
|
||||
return FeatureAccum::Skip;
|
||||
}
|
||||
|
||||
if let Some(ev) = enum_values.get(&fi) {
|
||||
FeatureAccum::Enum {
|
||||
value_counts: vec![0u64; ev.len()],
|
||||
}
|
||||
} else {
|
||||
let global_hist = &feature_stats_data[fi].histogram;
|
||||
let p1 = global_hist.p1;
|
||||
let p99 = global_hist.p99;
|
||||
let num_bins = global_hist.counts.len();
|
||||
let middle_bins = num_bins.saturating_sub(2);
|
||||
let middle_width = if middle_bins > 0 && p99 > p1 {
|
||||
(p99 - p1) / middle_bins as f32
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
FeatureAccum::Numeric {
|
||||
count: 0,
|
||||
min_value: f32::INFINITY,
|
||||
max_value: f32::NEG_INFINITY,
|
||||
sum: 0.0,
|
||||
bins: vec![0u64; num_bins],
|
||||
p1,
|
||||
p99,
|
||||
middle_width,
|
||||
num_bins,
|
||||
global_min: global_hist.min,
|
||||
global_max: global_hist.max,
|
||||
}
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Single pass: outer loop = rows, inner loop = features (cache-friendly row-major access)
|
||||
for &row in matching_rows {
|
||||
for (fi, accum) in accums.iter_mut().enumerate() {
|
||||
match accum {
|
||||
FeatureAccum::Skip => {}
|
||||
FeatureAccum::Enum { value_counts } => {
|
||||
let value = data.get_feature(row, fi);
|
||||
if value.is_finite() {
|
||||
let idx = value as usize;
|
||||
if idx < value_counts.len() {
|
||||
value_counts[idx] += 1;
|
||||
} else {
|
||||
warn!(
|
||||
feature = feature_names[fi].as_str(),
|
||||
idx,
|
||||
max = value_counts.len(),
|
||||
"Enum index out of bounds — possible data/schema mismatch"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
FeatureAccum::Numeric {
|
||||
count,
|
||||
min_value,
|
||||
max_value,
|
||||
sum,
|
||||
bins,
|
||||
p1,
|
||||
p99,
|
||||
middle_width,
|
||||
num_bins,
|
||||
..
|
||||
} => {
|
||||
let value = data.get_feature(row, fi);
|
||||
if value.is_finite() {
|
||||
*count += 1;
|
||||
if value < *min_value {
|
||||
*min_value = value;
|
||||
}
|
||||
if value > *max_value {
|
||||
*max_value = value;
|
||||
}
|
||||
*sum += value as f64;
|
||||
|
||||
let bin = if value < *p1 {
|
||||
0
|
||||
} else if value >= *p99 {
|
||||
*num_bins - 1
|
||||
} else if *middle_width > 0.0 {
|
||||
let middle_bin = ((value - *p1) / *middle_width) as usize;
|
||||
(1 + middle_bin).min(*num_bins - 2)
|
||||
} else {
|
||||
*num_bins / 2
|
||||
};
|
||||
bins[bin] += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Build response structs from accumulators
|
||||
let mut numeric_features = Vec::new();
|
||||
let mut enum_features_out = Vec::new();
|
||||
|
||||
for (feature_index, feature_name) in feature_names.iter().enumerate() {
|
||||
if fields_specified && !field_set.contains(feature_name.as_str()) {
|
||||
continue;
|
||||
}
|
||||
for (fi, accum) in accums.into_iter().enumerate() {
|
||||
match accum {
|
||||
FeatureAccum::Skip => {}
|
||||
FeatureAccum::Enum { value_counts } => {
|
||||
let ev = &enum_values[&fi];
|
||||
let counts: HashMap<String, u64> = value_counts
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter(|(_, &count)| count > 0)
|
||||
.map(|(idx, &count)| (ev[idx].clone(), count))
|
||||
.collect();
|
||||
|
||||
if let Some(ev) = enum_values.get(&feature_index) {
|
||||
let mut value_counts = vec![0u64; ev.len()];
|
||||
for &row in matching_rows {
|
||||
let value = data.get_feature(row, feature_index);
|
||||
if value.is_finite() {
|
||||
let idx = value as usize;
|
||||
if idx < value_counts.len() {
|
||||
value_counts[idx] += 1;
|
||||
} else {
|
||||
warn!(
|
||||
feature = feature_name.as_str(),
|
||||
idx,
|
||||
max = value_counts.len(),
|
||||
"Enum index out of bounds — possible data/schema mismatch"
|
||||
);
|
||||
}
|
||||
if !counts.is_empty() {
|
||||
enum_features_out.push(EnumFeatureStats {
|
||||
name: feature_names[fi].clone(),
|
||||
counts,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
let counts: HashMap<String, u64> = value_counts
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter(|(_, &count)| count > 0)
|
||||
.map(|(idx, &count)| (ev[idx].clone(), count))
|
||||
.collect();
|
||||
|
||||
if !counts.is_empty() {
|
||||
enum_features_out.push(EnumFeatureStats {
|
||||
name: feature_name.clone(),
|
||||
counts,
|
||||
});
|
||||
}
|
||||
} else {
|
||||
let global_hist = &feature_stats_data[feature_index].histogram;
|
||||
let p1 = global_hist.p1;
|
||||
let p99 = global_hist.p99;
|
||||
let num_bins = global_hist.counts.len();
|
||||
|
||||
let mut count = 0usize;
|
||||
let mut min_value = f32::INFINITY;
|
||||
let mut max_value = f32::NEG_INFINITY;
|
||||
let mut sum = 0.0f64;
|
||||
let mut bins = vec![0u64; num_bins];
|
||||
|
||||
let middle_bins = num_bins.saturating_sub(2);
|
||||
let middle_width = if middle_bins > 0 && p99 > p1 {
|
||||
(p99 - p1) / middle_bins as f32
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
|
||||
for &row in matching_rows {
|
||||
let value = data.get_feature(row, feature_index);
|
||||
if value.is_finite() {
|
||||
count += 1;
|
||||
if value < min_value {
|
||||
min_value = value;
|
||||
}
|
||||
if value > max_value {
|
||||
max_value = value;
|
||||
}
|
||||
sum += value as f64;
|
||||
|
||||
let bin = if value < p1 {
|
||||
0
|
||||
} else if value >= p99 {
|
||||
num_bins - 1
|
||||
} else if middle_width > 0.0 {
|
||||
let middle_bin = ((value - p1) / middle_width) as usize;
|
||||
(1 + middle_bin).min(num_bins - 2)
|
||||
} else {
|
||||
num_bins / 2
|
||||
};
|
||||
bins[bin] += 1;
|
||||
FeatureAccum::Numeric {
|
||||
count,
|
||||
min_value,
|
||||
max_value,
|
||||
sum,
|
||||
bins,
|
||||
p1,
|
||||
p99,
|
||||
global_min,
|
||||
global_max,
|
||||
..
|
||||
} => {
|
||||
if count > 0 {
|
||||
numeric_features.push(NumericFeatureStats {
|
||||
name: feature_names[fi].clone(),
|
||||
count,
|
||||
min: min_value as f64,
|
||||
max: max_value as f64,
|
||||
mean: sum / count as f64,
|
||||
histogram: HistogramStats {
|
||||
min: global_min as f64,
|
||||
max: global_max as f64,
|
||||
p1: p1 as f64,
|
||||
p99: p99 as f64,
|
||||
counts: bins,
|
||||
},
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
if count > 0 {
|
||||
numeric_features.push(NumericFeatureStats {
|
||||
name: feature_name.clone(),
|
||||
count,
|
||||
min: min_value as f64,
|
||||
max: max_value as f64,
|
||||
mean: sum / count as f64,
|
||||
histogram: HistogramStats {
|
||||
min: global_hist.min as f64,
|
||||
max: global_hist.max as f64,
|
||||
p1: p1 as f64,
|
||||
p99: p99 as f64,
|
||||
counts: bins,
|
||||
},
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue