// perfect-postcode/server-rs/src/aggregation.rs

use crate::consts::NAN_U16;
use crate::data::QuantRef;

/// Optional per-enum-value distribution tracking for a single feature.
/// Counts how many rows have each enum value (by raw u16 index).
pub struct EnumDist {
    /// Column index of the tracked feature within a row of feature data.
    pub feat_idx: usize,
    /// Histogram buckets: `counts[v]` is the number of rows whose raw value
    /// for the tracked feature was `v`. Raw values at or beyond
    /// `counts.len()` are not recorded by the aggregation methods.
    pub counts: Box<[u32]>,
}

/// Per-cell accumulator for aggregating features (min/max/sum/count).
/// Uses Box<[T]> instead of Vec<T> to avoid storing capacity (saves 8 bytes per field per cell).
/// Shared by hexagon and postcode aggregation routes.
///
/// The four per-feature slices are indexed by feature index and are all
/// allocated with the same length by `Aggregator::new`.
pub struct Aggregator {
    /// Number of rows added, whether or not any of their features were present.
    pub count: u32,
    /// Smallest decoded value seen per feature; starts at `f32::INFINITY`.
    pub mins: Box<[f32]>,
    /// Largest decoded value seen per feature; starts at `f32::NEG_INFINITY`.
    pub maxs: Box<[f32]>,
    /// Running sum of decoded values per feature, accumulated as `f64`.
    pub sums: Box<[f64]>,
    /// Number of non-missing values accumulated per feature. A feature whose
    /// entry is 0 still holds the initial infinities in `mins` / `maxs`.
    pub feat_counts: Box<[u32]>,
    /// Optional: per-value counts for a single enum feature (for pie chart visualization).
    pub enum_dist: Option<EnumDist>,
}

/// Configuration for enum distribution tracking, passed to Aggregator::new.
/// (feature_index, number_of_enum_values)
///
/// `None` disables distribution tracking. With `Some((feature_index, n))`,
/// `Aggregator::new` allocates `n` zeroed buckets for the feature at
/// `feature_index`.
pub type EnumDistConfig = Option<(usize, usize)>;

impl Aggregator {
    /// Creates an empty accumulator sized for `num_features` features.
    ///
    /// Minimums start at `+inf` and maximums at `-inf`, so the first value
    /// accumulated for a feature always replaces both. When
    /// `enum_dist_config` is `Some((feat_idx, num_values))`, a zeroed
    /// histogram with `num_values` buckets is allocated for that feature.
    pub fn new(num_features: usize, enum_dist_config: EnumDistConfig) -> Self {
        Aggregator {
            count: 0,
            mins: vec![f32::INFINITY; num_features].into_boxed_slice(),
            maxs: vec![f32::NEG_INFINITY; num_features].into_boxed_slice(),
            sums: vec![0.0f64; num_features].into_boxed_slice(),
            feat_counts: vec![0u32; num_features].into_boxed_slice(),
            enum_dist: enum_dist_config.map(|(feat_idx, num_values)| EnumDist {
                feat_idx,
                counts: vec![0u32; num_values].into_boxed_slice(),
            }),
        }
    }

    /// Folds one decoded value into the min/max/sum/count of `feat_index`.
    #[inline]
    fn accumulate_value(&mut self, feat_index: usize, value: f32) {
        if value < self.mins[feat_index] {
            self.mins[feat_index] = value;
        }
        if value > self.maxs[feat_index] {
            self.maxs[feat_index] = value;
        }
        self.sums[feat_index] += f64::from(value);
        self.feat_counts[feat_index] += 1;
    }

    /// Tallies the tracked enum feature of `row_slice`, if tracking is enabled.
    ///
    /// Uses the raw u16 directly as the bucket index. Missing values
    /// (`NAN_U16`) and raw values beyond the configured bucket count are
    /// skipped. Indexing the row slice (rather than the whole data buffer)
    /// guarantees a misconfigured feature index can never read another row.
    #[inline]
    fn tally_enum_value(&mut self, row_slice: &[u16]) {
        if let Some(ed) = &mut self.enum_dist {
            let raw = row_slice[ed.feat_idx];
            if raw != NAN_U16 {
                if let Some(bucket) = ed.counts.get_mut(usize::from(raw)) {
                    *bucket += 1;
                }
            }
        }
    }

    /// Add a row using row-major feature_data layout (quantized u16).
    ///
    /// `feature_data[row * num_features + feat_idx]` — all features for one
    /// row are contiguous. Every feature whose raw value is not `NAN_U16` is
    /// decoded through `quant` and accumulated; the row itself is always
    /// counted in `count`.
    ///
    /// # Panics
    ///
    /// Panics if the row does not lie fully inside `feature_data`, if this
    /// aggregator holds fewer than `num_features` features, or if the tracked
    /// enum feature index is not below `num_features`.
    #[inline]
    pub fn add_row(
        &mut self,
        feature_data: &[u16],
        row: usize,
        num_features: usize,
        quant: &QuantRef,
    ) {
        self.count += 1;
        let base = row * num_features;
        let row_slice = &feature_data[base..base + num_features];
        for (feat_index, &raw) in row_slice.iter().enumerate() {
            if raw != NAN_U16 {
                self.accumulate_value(feat_index, quant.decode(feat_index, raw));
            }
        }
        // Enum distribution: single branch per row (not per feature).
        self.tally_enum_value(row_slice);
    }

    /// Merge another aggregator's results into this one.
    ///
    /// Row counts are added. For every feature index of `self`, `other`'s
    /// min/max/sum/count are folded in when `other` saw at least one value for
    /// that feature (so its initial infinities are never copied over).
    ///
    /// Enum distributions are merged bucket-by-bucket only when both sides
    /// track one; buckets are paired positionally and any buckets `other` has
    /// beyond `self`'s length are ignored. If `self` tracks no distribution,
    /// `other`'s distribution is not adopted.
    ///
    /// # Panics
    ///
    /// Panics if `other` holds fewer features than `self`.
    pub fn merge(&mut self, other: &Aggregator) {
        self.count += other.count;
        for i in 0..self.mins.len() {
            if other.feat_counts[i] > 0 {
                if other.mins[i] < self.mins[i] {
                    self.mins[i] = other.mins[i];
                }
                if other.maxs[i] > self.maxs[i] {
                    self.maxs[i] = other.maxs[i];
                }
                self.sums[i] += other.sums[i];
                self.feat_counts[i] += other.feat_counts[i];
            }
        }
        // `as_mut` / `as_ref` avoid explicit `ref` bindings under a reference
        // pattern, which newer editions reject.
        if let (Some(mine), Some(theirs)) = (self.enum_dist.as_mut(), other.enum_dist.as_ref()) {
            for (m, t) in mine.counts.iter_mut().zip(theirs.counts.iter()) {
                *m += *t;
            }
        }
    }

    /// Add a row, only aggregating the features at the given indices.
    ///
    /// The row is always counted in `count`, and the enum distribution (if
    /// enabled) is tallied regardless of whether its feature is listed in
    /// `indices`. All reads are confined to the row's own slice.
    ///
    /// # Panics
    ///
    /// Panics if the row does not lie fully inside `feature_data`, or if any
    /// entry of `indices` or the tracked enum feature index is not below
    /// `num_features`.
    #[inline]
    pub fn add_row_selective(
        &mut self,
        feature_data: &[u16],
        row: usize,
        num_features: usize,
        indices: &[usize],
        quant: &QuantRef,
    ) {
        self.count += 1;
        let base = row * num_features;
        let row_slice = &feature_data[base..base + num_features];
        for &feat_index in indices {
            let raw = row_slice[feat_index];
            if raw != NAN_U16 {
                self.accumulate_value(feat_index, quant.decode(feat_index, raw));
            }
        }
        // Enum distribution (same raw u16 approach as `add_row`).
        self.tally_enum_value(row_slice);
    }
}