use crate::consts::NAN_U16; use crate::data::QuantRef; /// Optional per-enum-value distribution tracking for a single feature. /// Counts how many rows have each enum value (by raw u16 index). pub struct EnumDist { pub feat_idx: usize, pub counts: Box<[u32]>, } /// Per-cell accumulator for aggregating features (min/max/sum/count). /// Uses Box<[T]> instead of Vec to avoid storing capacity (saves 8 bytes per field per cell). /// Shared by hexagon and postcode aggregation routes. pub struct Aggregator { pub count: u32, pub mins: Box<[f32]>, pub maxs: Box<[f32]>, pub sums: Box<[f64]>, pub feat_counts: Box<[u32]>, /// Optional: per-value counts for a single enum feature (for pie chart visualization). pub enum_dist: Option, } /// Configuration for enum distribution tracking, passed to Aggregator::new. /// (feature_index, number_of_enum_values) pub type EnumDistConfig = Option<(usize, usize)>; impl Aggregator { pub fn new(num_features: usize, enum_dist_config: EnumDistConfig) -> Self { Aggregator { count: 0, mins: vec![f32::INFINITY; num_features].into_boxed_slice(), maxs: vec![f32::NEG_INFINITY; num_features].into_boxed_slice(), sums: vec![0.0f64; num_features].into_boxed_slice(), feat_counts: vec![0u32; num_features].into_boxed_slice(), enum_dist: enum_dist_config.map(|(feat_idx, num_values)| EnumDist { feat_idx, counts: vec![0u32; num_values].into_boxed_slice(), }), } } /// Add a row using row-major feature_data layout (quantized u16). /// feature_data[row * num_features + feat_idx] — all features for one row /// are contiguous, so this reads a single cache line per ~16 features. #[inline] pub fn add_row( &mut self, feature_data: &[u16], row: usize, num_features: usize, quant: &QuantRef, ) { self.count += 1; let base = row * num_features; let row_slice = &feature_data[base..base + num_features]; for (feat_index, &raw) in row_slice.iter().enumerate() { if raw != NAN_U16 { let value = quant.decode(feat_index, raw); if value < self.mins[feat_index] { self.mins[feat_index] = value; } if value > self.maxs[feat_index] { self.maxs[feat_index] = value; } self.sums[feat_index] += value as f64; self.feat_counts[feat_index] += 1; } } // Enum distribution: single branch per row (not per feature). // Uses raw u16 directly — enum features are stored as u16 indices. if let Some(ref mut ed) = self.enum_dist { let raw = row_slice[ed.feat_idx]; if raw != NAN_U16 { let idx = raw as usize; if idx < ed.counts.len() { ed.counts[idx] += 1; } } } } /// Merge another aggregator's results into this one. pub fn merge(&mut self, other: &Aggregator) { self.count += other.count; for i in 0..self.mins.len() { if other.feat_counts[i] > 0 { if other.mins[i] < self.mins[i] { self.mins[i] = other.mins[i]; } if other.maxs[i] > self.maxs[i] { self.maxs[i] = other.maxs[i]; } self.sums[i] += other.sums[i]; self.feat_counts[i] += other.feat_counts[i]; } } // Merge enum distribution counts if let (Some(ref mut mine), Some(ref theirs)) = (&mut self.enum_dist, &other.enum_dist) { for (m, t) in mine.counts.iter_mut().zip(theirs.counts.iter()) { *m += t; } } } /// Add a row, only aggregating the features at the given indices. #[inline] pub fn add_row_selective( &mut self, feature_data: &[u16], row: usize, num_features: usize, indices: &[usize], quant: &QuantRef, ) { self.count += 1; let base = row * num_features; for &feat_index in indices { let raw = feature_data[base + feat_index]; if raw != NAN_U16 { let value = quant.decode(feat_index, raw); if value < self.mins[feat_index] { self.mins[feat_index] = value; } if value > self.maxs[feat_index] { self.maxs[feat_index] = value; } self.sums[feat_index] += value as f64; self.feat_counts[feat_index] += 1; } } // Enum distribution (same raw u16 approach) if let Some(ref mut ed) = self.enum_dist { let raw = feature_data[base + ed.feat_idx]; if raw != NAN_U16 { let idx = raw as usize; if idx < ed.counts.len() { ed.counts[idx] += 1; } } } } }