perfect-postcode/server-rs/src/data/property.rs

use anyhow::{bail, Context};
use polars::lazy::frame::LazyFrame;
use polars::prelude::*;
use rayon::prelude::*;
use serde::Serialize;
use std::path::Path;

use rustc_hash::{FxHashMap, FxHashSet};

use crate::consts::{H3_PRECOMPUTE_MAX, HISTOGRAM_BINS, NAN_U16, QUANT_SCALE};
use crate::features::{self, Bounds};

/// Maximum number of rows taken from the chosen posting list before free-text
/// address candidates are scored.
const ADDRESS_SEARCH_CANDIDATE_LIMIT: usize = 50_000;
/// NOTE(review): presumably caps how many rows are kept per indexed token when
/// the inverted index is built; the use site is outside this section — confirm.
const ADDRESS_SEARCH_MAX_POSTINGS_PER_TOKEN: usize = 250_000;
/// Shortest token prefix (in bytes) stored in the address prefix index.
const ADDRESS_SEARCH_PREFIX_MIN_LEN: usize = 4;
/// Longest token prefix (in bytes) stored in the address prefix index; longer
/// query terms are truncated to this length before the prefix lookup.
const ADDRESS_SEARCH_PREFIX_MAX_LEN: usize = 8;

/// Returns `true` when `dtype` is an integer, floating-point or temporal
/// (`Date` / `Datetime`) type; every other dtype yields `false`.
fn is_numeric_dtype(dtype: &DataType) -> bool {
    match dtype {
        // Signed integers.
        DataType::Int8 | DataType::Int16 | DataType::Int32 | DataType::Int64 => true,
        // Unsigned integers.
        DataType::UInt8 | DataType::UInt16 | DataType::UInt32 | DataType::UInt64 => true,
        // Floating point.
        DataType::Float32 | DataType::Float64 => true,
        // Temporal columns are counted as numeric here.
        DataType::Date | DataType::Datetime(_, _) => true,
        _ => false,
    }
}

/// Returns `true` only for the temporal dtypes `Date` and `Datetime`.
fn is_datetime_dtype(dtype: &DataType) -> bool {
    match dtype {
        DataType::Date | DataType::Datetime(_, _) => true,
        _ => false,
    }
}

/// One query word together with its interchangeable spellings.
///
/// A row satisfies the group when any of its search tokens matches any entry
/// in `alternatives`.
#[derive(Clone, Debug)]
struct AddressTermGroup {
    /// The typed token first, followed by its known aliases (e.g. `avenue`, `ave`).
    alternatives: Vec<String>,
}

/// Structured form of a free-text address query, produced by `parse_address_query`.
#[derive(Debug)]
struct AddressQuery {
    /// Canonical `OUTWARD INWARD` upper-case postcode found in the query, if any.
    full_postcode: Option<String>,
    /// Word groups that a row must all match to be considered a hit.
    text_groups: Vec<AddressTermGroup>,
    /// All-digit tokens plus short digit-containing tokens (e.g. `1a`),
    /// sorted and deduplicated.
    numeric_terms: Vec<String>,
    /// Non-stop-word terms used to pick the posting list that seeds candidates.
    candidate_terms: Vec<String>,
}

/// Split `text` into lower-cased ASCII alphanumeric tokens.
///
/// Apostrophe-like characters (`'`, `’`, `` ` ``) are dropped without ending
/// the current token, so `John's` becomes `johns`. Every other character that
/// is not ASCII alphanumeric acts as a separator.
fn tokenize_address_text(text: &str) -> Vec<String> {
    let mut tokens = Vec::new();
    let mut word = String::new();

    for ch in text.chars() {
        match ch {
            _ if ch.is_ascii_alphanumeric() => word.push(ch.to_ascii_lowercase()),
            // Apostrophes are swallowed so possessives stay a single token.
            '\'' | '’' | '`' => {}
            _ => {
                if !word.is_empty() {
                    tokens.push(std::mem::take(&mut word));
                }
            }
        }
    }

    // Flush the trailing token when the text does not end on a separator.
    if !word.is_empty() {
        tokens.push(word);
    }

    tokens
}

/// Check whether `compact` (a postcode with the space removed) has the shape
/// of a full postcode: a 2–4 byte outward part that starts with a letter,
/// is entirely ASCII alphanumeric and contains a digit, followed by a
/// digit-letter-letter inward part.
fn is_full_postcode_compact(compact: &str) -> bool {
    let bytes = compact.as_bytes();
    if bytes.len() < 5 || bytes.len() > 7 {
        return false;
    }

    let (outward, inward) = bytes.split_at(bytes.len() - 3);

    // Inward part: one digit followed by two letters.
    let inward_ok = matches!(
        inward,
        [digit, first, second]
            if digit.is_ascii_digit()
                && first.is_ascii_alphabetic()
                && second.is_ascii_alphabetic()
    );
    if !inward_ok {
        return false;
    }

    (2..=4).contains(&outward.len())
        && outward.first().is_some_and(u8::is_ascii_alphabetic)
        && outward.iter().all(u8::is_ascii_alphanumeric)
        && outward.iter().any(u8::is_ascii_digit)
}

/// Upper-case a compact postcode and insert a single space before its last
/// three characters (`sw1a1aa` -> `SW1A 1AA`).
///
/// Expects input of at least three bytes; callers validate the shape first.
fn canonical_postcode_from_compact(compact: &str) -> String {
    let upper = compact.to_ascii_uppercase();
    let (outward, inward) = upper.split_at(upper.len() - 3);
    [outward, inward].join(" ")
}

/// Find the first full postcode among `tokens`.
///
/// Single tokens are tried first (`sw1a1aa`), then each adjacent pair joined
/// together (`sw1a` + `1aa`). Returns the canonical postcode and the indices
/// of the token(s) that formed it.
fn extract_full_postcode(tokens: &[String]) -> Option<(String, Vec<usize>)> {
    let single = tokens.iter().enumerate().find_map(|(idx, token)| {
        let compact = token.to_ascii_uppercase();
        is_full_postcode_compact(&compact)
            .then(|| (canonical_postcode_from_compact(&compact), vec![idx]))
    });

    single.or_else(|| {
        tokens.windows(2).enumerate().find_map(|(idx, pair)| {
            let mut compact = pair[0].to_ascii_uppercase();
            compact.push_str(&pair[1].to_ascii_uppercase());
            is_full_postcode_compact(&compact)
                .then(|| (canonical_postcode_from_compact(&compact), vec![idx, idx + 1]))
        })
    })
}

/// Heuristic for a partial postcode such as `sw1` or `e1`: 2–4 bytes long,
/// starts with an ASCII letter, contains at least one digit and consists only
/// of ASCII alphanumerics.
fn looks_like_postcode_fragment(token: &str) -> bool {
    let bytes = token.as_bytes();
    if !(2..=4).contains(&bytes.len()) {
        return false;
    }

    bytes[0].is_ascii_alphabetic()
        && bytes.iter().any(u8::is_ascii_digit)
        && bytes.iter().all(u8::is_ascii_alphanumeric)
}

/// Returns `true` when every byte of `token` is an ASCII digit.
/// An empty token vacuously counts as numeric.
fn is_numeric_address_token(token: &str) -> bool {
    token.bytes().all(|byte| byte.is_ascii_digit())
}

/// Look up the interchangeable spellings for a street-type or unit word.
///
/// The returned list starts with `token` itself, followed by its aliases;
/// unknown tokens yield an empty list.
fn address_token_aliases(token: &str) -> Vec<&'static str> {
    let aliases: &[&'static str] = match token {
        "apt" => &["apt", "apartment"],
        "apartment" => &["apartment", "apt"],
        "ave" => &["ave", "avenue"],
        "avenue" => &["avenue", "ave"],
        "blvd" => &["blvd", "boulevard"],
        "boulevard" => &["boulevard", "blvd"],
        "cl" => &["cl", "close"],
        "close" => &["close", "cl"],
        "ct" => &["ct", "court"],
        "court" => &["court", "ct"],
        "cres" => &["cres", "crescent"],
        "crescent" => &["crescent", "cres"],
        "dr" => &["dr", "drive"],
        "drive" => &["drive", "dr"],
        "fl" => &["fl", "flat"],
        "flat" => &["flat", "fl"],
        "gdns" => &["gdns", "gardens", "garden"],
        "garden" => &["garden", "gardens", "gdns"],
        "gardens" => &["gardens", "garden", "gdns"],
        "hse" => &["hse", "house"],
        "house" => &["house", "hse"],
        "ln" => &["ln", "lane"],
        "lane" => &["lane", "ln"],
        "rd" => &["rd", "road"],
        "road" => &["road", "rd"],
        "sq" => &["sq", "square"],
        "square" => &["square", "sq"],
        // `st` is ambiguous between "street" and "saint".
        "st" => &["st", "street", "saint"],
        "street" => &["street", "st"],
        "saint" => &["saint", "st"],
        "terr" => &["terr", "terrace"],
        "terrace" => &["terrace", "terr"],
        _ => &[],
    };
    aliases.to_vec()
}

/// Returns `true` for words too common in addresses to be useful on their own
/// (articles, street types, unit and building words). Matching is exact and
/// case-sensitive; callers pass lower-cased tokens.
fn is_address_stop_token(token: &str) -> bool {
    const STOP_TOKENS: &[&str] = &[
        "a",
        "an",
        "and",
        "apartment",
        "apt",
        "avenue",
        "ave",
        "block",
        "building",
        "bungalow",
        "close",
        "cl",
        "court",
        "ct",
        "cres",
        "crescent",
        "drive",
        "dr",
        "estate",
        "flat",
        "fl",
        "floor",
        "garden",
        "gardens",
        "gdns",
        "grove",
        "house",
        "hse",
        "lane",
        "ln",
        "lodge",
        "mansions",
        "mews",
        "of",
        "park",
        "place",
        "road",
        "rd",
        "room",
        "row",
        "saint",
        "sq",
        "square",
        "st",
        "street",
        "terr",
        "terrace",
        "the",
        "unit",
        "view",
        "villas",
        "walk",
        "way",
        "yard",
    ];

    STOP_TOKENS.contains(&token)
}

/// Build the alternative-spelling group for a query word.
///
/// Returns `None` for tokens shorter than three bytes, all-digit tokens,
/// postcode-like fragments, and words whose every spelling is a stop token
/// (e.g. `road`), since those cannot narrow a search on their own.
fn address_term_group(token: &str) -> Option<AddressTermGroup> {
    let unusable = token.len() < 3
        || is_numeric_address_token(token)
        || looks_like_postcode_fragment(token);
    if unusable {
        return None;
    }

    // The typed token always comes first; aliases follow without repeats.
    let mut alternatives = vec![token.to_string()];
    for alias in address_token_aliases(token) {
        let already_listed = alternatives.iter().any(|existing| existing == alias);
        if !already_listed {
            alternatives.push(alias.to_string());
        }
    }

    let has_distinctive_spelling = alternatives
        .iter()
        .any(|alternative| !is_address_stop_token(alternative));

    has_distinctive_spelling.then_some(AddressTermGroup { alternatives })
}

/// Tokenize `text` and keep only the tokens worth indexing for search,
/// returned sorted and without duplicates.
fn address_search_tokens(text: &str) -> Vec<String> {
    let mut tokens = tokenize_address_text(text);
    tokens.retain(|token| is_address_search_token(token));
    tokens.sort_unstable();
    tokens.dedup();
    tokens
}

/// Decide whether a token is kept as a per-row search token.
///
/// Postcode-like fragments are rejected, all-digit tokens are always kept,
/// and other tokens need at least two bytes when they contain a digit or
/// three bytes otherwise.
fn is_address_search_token(token: &str) -> bool {
    if looks_like_postcode_fragment(token) {
        false
    } else if is_numeric_address_token(token) {
        true
    } else {
        let has_digit = token.chars().any(|ch| ch.is_ascii_digit());
        let min_len = if has_digit { 2 } else { 3 };
        token.len() >= min_len
    }
}

/// Decide whether a token is distinctive enough to seed candidate lookup:
/// never pure numbers or postcode-like fragments; otherwise any token with a
/// digit, or a non-stop word of at least three bytes.
fn is_address_candidate_token(token: &str) -> bool {
    if is_numeric_address_token(token) || looks_like_postcode_fragment(token) {
        return false;
    }

    let has_digit = token.chars().any(|ch| ch.is_ascii_digit());
    has_digit || (token.len() >= 3 && !is_address_stop_token(token))
}

fn address_prefix_key(term: &str) -> &str {
    if term.len() > ADDRESS_SEARCH_PREFIX_MAX_LEN {
        &term[..ADDRESS_SEARCH_PREFIX_MAX_LEN]
    } else {
        term
    }
}

/// Build the prefix -> full-token lookup used for type-ahead matching.
///
/// Every indexed token contributes each of its prefixes whose byte length
/// lies between `ADDRESS_SEARCH_PREFIX_MIN_LEN` and
/// `ADDRESS_SEARCH_PREFIX_MAX_LEN` (capped at the token length). Each
/// prefix's token list is sorted and deduplicated.
fn build_address_prefix_index(
    address_token_index: &FxHashMap<String, Vec<u32>>,
) -> FxHashMap<String, Vec<String>> {
    let mut by_prefix: FxHashMap<String, Vec<String>> = FxHashMap::default();

    for token in address_token_index.keys() {
        let longest = token.len().min(ADDRESS_SEARCH_PREFIX_MAX_LEN);
        // Tokens shorter than the minimum prefix length yield an empty range.
        (ADDRESS_SEARCH_PREFIX_MIN_LEN..=longest).for_each(|prefix_len| {
            by_prefix
                .entry(token[..prefix_len].to_owned())
                .or_default()
                .push(token.clone());
        });
    }

    by_prefix.values_mut().for_each(|expansions| {
        expansions.sort_unstable();
        expansions.dedup();
    });

    by_prefix
}

fn parse_address_query(query: &str) -> AddressQuery {
    let tokens = tokenize_address_text(query);
    let (full_postcode, postcode_token_indices) = extract_full_postcode(&tokens)
        .map(|(postcode, indices)| (Some(postcode), indices))
        .unwrap_or((None, Vec::new()));

    let skip_postcode_tokens: FxHashSet<usize> = postcode_token_indices.into_iter().collect();
    let mut text_groups = Vec::new();
    let mut numeric_terms = Vec::new();
    let mut candidate_terms = Vec::new();

    for (idx, token) in tokens.iter().enumerate() {
        if skip_postcode_tokens.contains(&idx) || looks_like_postcode_fragment(token) {
            continue;
        }

        if is_numeric_address_token(token) {
            numeric_terms.push(token.clone());
            continue;
        }

        if let Some(group) = address_term_group(token) {
            for alternative in &group.alternatives {
                if !is_address_stop_token(alternative)
                    && !candidate_terms.iter().any(|term| term == alternative)
                {
                    candidate_terms.push(alternative.clone());
                }
            }
            text_groups.push(group);
        } else if token.chars().any(|ch| ch.is_ascii_digit()) && token.len() >= 2 {
            numeric_terms.push(token.clone());
            if !candidate_terms.iter().any(|term| term == token) {
                candidate_terms.push(token.clone());
            }
        }
    }

    text_groups.dedup_by(|left, right| left.alternatives == right.alternatives);
    numeric_terms.sort_unstable();
    numeric_terms.dedup();

    AddressQuery {
        full_postcode,
        text_groups,
        numeric_terms,
        candidate_terms,
    }
}

/// Match an indexed token against a query term: terms of three or more bytes
/// match as a prefix, shorter terms must match exactly.
fn token_matches_query_term(token: &str, query_term: &str) -> bool {
    if query_term.len() >= 3 {
        // An exact match is also a prefix match, so one check covers both.
        token.starts_with(query_term)
    } else {
        token == query_term
    }
}

/// Match an indexed token against a numeric query term by prefix, so `12`
/// matches both `12` and `12a`. An exact match is a prefix match too.
fn token_matches_numeric_term(token: &str, query_term: &str) -> bool {
    token.starts_with(query_term)
}

/// Test-only helper: does any token match any spelling in `group`?
#[cfg(test)]
fn address_tokens_match_group(tokens: &[String], group: &AddressTermGroup) -> bool {
    for alternative in &group.alternatives {
        for token in tokens {
            if token_matches_query_term(token, alternative) {
                return true;
            }
        }
    }
    false
}

/// Histogram with outlier buckets at the edges.
/// - Bin 0: [min, p1) — low outliers
/// - Bins 1 to n-2: [p1, p99) — main distribution, evenly divided
/// - Bin n-1: [p99, max] — high outliers
#[derive(Serialize, Clone)]
pub struct Histogram {
    /// Reported lower bound of the value range.
    pub min: f32,
    /// Reported upper bound of the value range.
    pub max: f32,
    /// 1st percentile (left edge of main distribution)
    pub p1: f32,
    /// 99th percentile (right edge of main distribution)
    pub p99: f32,
    /// Number of values per bin, in the layout described above.
    pub counts: Vec<u64>,
}

impl Histogram {
    /// Map `value` to its bin index under the outlier-bracket layout:
    /// bin 0 below `p1`, the last bin at or above `p99`, and an evenly
    /// divided middle bin otherwise. When there is no usable middle range
    /// the centre bin (`len / 2`) is returned.
    #[cfg(test)]
    pub fn bin_for_value(&self, value: f32) -> usize {
        let num_bins = self.counts.len();
        if value < self.p1 {
            return 0;
        }
        if value >= self.p99 {
            return num_bins - 1;
        }

        let middle_bins = num_bins.saturating_sub(2);
        let has_middle = middle_bins > 0 && self.p99 > self.p1;
        if !has_middle {
            return num_bins / 2;
        }

        let width = (self.p99 - self.p1) / middle_bins as f32;
        let offset = ((value - self.p1) / width) as usize;
        // Clamp so rounding at the top edge never spills into the outlier bin.
        (offset + 1).min(num_bins - 2)
    }

    /// Width of one middle bin (bins 1..n-2), or `0.0` when there are no
    /// middle bins or `p99` does not exceed `p1`.
    #[cfg(test)]
    pub fn middle_bin_width(&self) -> f32 {
        match self.counts.len().saturating_sub(2) {
            0 => 0.0,
            middle_bins if self.p99 > self.p1 => (self.p99 - self.p1) / middle_bins as f32,
            _ => 0.0,
        }
    }
}

/// Summary statistics for one feature, as produced by `compute_feature_stats`.
pub struct FeatureStats {
    /// Lower slider bound: the fixed minimum, or the configured low percentile.
    pub slider_min: f32,
    /// Upper slider bound: the fixed maximum, or the configured high percentile.
    pub slider_max: f32,
    /// Distribution of the feature's finite values.
    pub histogram: Histogram,
}

/// A single entry in a property's renovation history.
#[derive(Serialize, Clone)]
pub struct RenovationEvent {
    /// Year of the event — presumably a calendar year; confirm against the loader.
    pub year: i32,
    /// Description of the event.
    pub event: String,
}

/// Lightweight reference to quantization parameters for decoding u16 feature data.
pub struct QuantRef<'a> {
    /// Per-feature multiplier applied to the raw u16 when decoding.
    pub dequant_a: &'a [f32],
    /// Per-feature offset added after scaling when decoding.
    pub quant_min: &'a [f32],
    /// Per-feature value range used when encoding filter bounds.
    pub quant_range: &'a [f32],
    /// Features at or above this index are decoded as a plain cast of the raw value.
    pub num_numeric: usize,
}

impl QuantRef<'_> {
    /// Decode a raw u16 back to f32.
    ///
    /// The NaN sentinel decodes to `f32::NAN`; features at or beyond
    /// `num_numeric` are returned as a plain cast; numeric features are
    /// rescaled with the per-feature multiplier and offset.
    #[inline]
    pub fn decode(&self, feat_idx: usize, raw: u16) -> f32 {
        if raw == NAN_U16 {
            f32::NAN
        } else if feat_idx < self.num_numeric {
            raw as f32 * self.dequant_a[feat_idx] + self.quant_min[feat_idx]
        } else {
            raw as f32
        }
    }

    /// Encode a filter minimum bound to u16, rounding down so boundary values
    /// stay included. Non-finite bounds and zero-range features map to `0`.
    #[inline]
    pub fn encode_min(&self, feat_idx: usize, value: f32) -> u16 {
        let unbounded = !value.is_finite() || self.quant_range[feat_idx] == 0.0;
        if unbounded {
            return 0;
        }
        let offset = value - self.quant_min[feat_idx];
        let scaled = offset / self.quant_range[feat_idx] * QUANT_SCALE;
        scaled.floor().clamp(0.0, QUANT_SCALE) as u16
    }

    /// Encode a filter maximum bound to u16, rounding up so boundary values
    /// stay included. Non-finite bounds and zero-range features map to the
    /// top of the scale.
    #[inline]
    pub fn encode_max(&self, feat_idx: usize, value: f32) -> u16 {
        let unbounded = !value.is_finite() || self.quant_range[feat_idx] == 0.0;
        if unbounded {
            return QUANT_SCALE as u16;
        }
        let offset = value - self.quant_min[feat_idx];
        let scaled = offset / self.quant_range[feat_idx] * QUANT_SCALE;
        scaled.ceil().clamp(0.0, QUANT_SCALE) as u16
    }
}

/// In-memory property dataset: per-row coordinates, quantized feature values,
/// address and postcode storage, and the indexes used for address search.
pub struct PropertyData {
    /// Per-row latitude — presumably in degrees; confirm against the loader.
    pub lat: Vec<f32>,
    /// Per-row longitude — presumably in degrees; confirm against the loader.
    pub lon: Vec<f32>,
    /// Feature names — presumably indexed by feature index; confirm.
    pub feature_names: Vec<String>,
    /// Number of features per row; the row stride of `feature_data`.
    pub num_features: usize,
    /// Number of numeric features (enum features start at this index).
    pub num_numeric: usize,
    /// Row-major flat array: feature_data[row * num_features + feat_idx].
    /// Quantized to u16. NaN sentinel = u16::MAX (65535).
    /// Numeric features: encoded via (val - min) / range * 65534.
    /// Enum features: stored directly as u16 cast of the f32 index.
    pub feature_data: Vec<u16>,
    /// Per-feature: range / QUANT_SCALE for fast decode.
    dequant_a: Vec<f32>,
    /// Per-feature: minimum value (offset for dequantization).
    quant_min: Vec<f32>,
    /// Per-feature: max - min (for encoding filter bounds).
    quant_range: Vec<f32>,
    /// Slider bounds and histogram — presumably one entry per feature; confirm.
    pub feature_stats: Vec<FeatureStats>,
    /// Unquantized last sale price used by the price-history chart.
    last_known_price_raw: Vec<f32>,
    /// Contiguous buffer holding all address strings end-to-end.
    address_buffer: String,
    /// Byte offset into `address_buffer` where each row's address starts.
    address_offsets: Vec<u32>,
    /// Length in bytes of each row's address.
    address_lengths: Vec<u16>,
    /// Interned postcodes: reader is thread-safe, keys index into it.
    postcode_interner: lasso::RodeoReader,
    /// Per-row key into `postcode_interner`.
    postcode_keys: Vec<lasso::Spur>,
    /// Rows for each postcode, keyed by the interned postcode key.
    postcode_row_index: FxHashMap<lasso::Spur, Vec<u32>>,
    /// Inverted index from address tokens to property rows.
    address_token_index: FxHashMap<String, Vec<u32>>,
    /// Prefix lookup from typed address-token prefix to indexed full address tokens.
    address_prefix_index: FxHashMap<String, Vec<String>>,
    /// Interned normalized address-search tokens used for per-row scoring.
    address_search_interner: lasso::RodeoReader,
    /// Flat per-row normalized address-search token keys.
    address_search_token_keys: Vec<lasso::Spur>,
    /// Offset into `address_search_token_keys` for each row.
    address_search_token_offsets: Vec<u32>,
    /// Number of normalized address-search token keys for each row.
    address_search_token_lengths: Vec<u16>,
    /// For enum features: maps feature index to list of possible string values.
    /// Index in values list corresponds to the u16 value stored in feature_data.
    pub enum_values: rustc_hash::FxHashMap<usize, Vec<String>>,
    /// For enum features: maps feature index to per-value global counts (same order as enum_values).
    pub enum_counts: rustc_hash::FxHashMap<usize, Vec<u64>>,
    /// Per-row flag: true = construction date is approximate (from EPC band),
    /// false = exact (from new-build transaction date).
    /// Bit-packed: byte `row / 8`, bit `row % 8`. 8x smaller than Vec<bool>.
    approx_build_date_bits: Vec<u8>,
    /// Per-row renovation events. Keyed by (permuted) row index.
    /// Only rows with events are present in the map.
    renovation_history: FxHashMap<u32, Vec<RenovationEvent>>,
    /// Property sub-type keyed by row index; rows without one are absent.
    property_sub_type: FxHashMap<u32, String>,
    /// Price qualifier keyed by row index; rows without one are absent.
    price_qualifier: FxHashMap<u32, String>,
}

impl PropertyData {
    /// Borrow the address text for `row` out of the shared address buffer.
    ///
    /// Panics if `row` is out of range.
    pub fn address(&self, row: usize) -> &str {
        let start = self.address_offsets[row] as usize;
        let end = start + self.address_lengths[row] as usize;
        &self.address_buffer[start..end]
    }

    /// Resolve the interned postcode string for `row`.
    ///
    /// Panics if `row` is out of range.
    pub fn postcode(&self, row: usize) -> &str {
        let key = &self.postcode_keys[row];
        self.postcode_interner.resolve(key)
    }

    /// Expose the postcode interner and the per-row key slice separately, so
    /// callers can borrow them at field level without conflicting with a
    /// borrow of `feature_data`.
    pub fn postcode_parts(&self) -> (&lasso::RodeoReader, &[lasso::Spur]) {
        let interner = &self.postcode_interner;
        let keys = self.postcode_keys.as_slice();
        (interner, keys)
    }

    /// Slice of interned search-token keys belonging to `row`.
    fn row_address_search_tokens(&self, row: usize) -> &[lasso::Spur] {
        let start = self.address_search_token_offsets[row] as usize;
        let end = start + self.address_search_token_lengths[row] as usize;
        &self.address_search_token_keys[start..end]
    }

    /// Search individual property addresses. Full postcode queries use a direct row index;
    /// free-text queries use a small inverted index over distinctive address tokens.
    ///
    /// Returns up to `limit` row indices, best match first. Ties are broken by
    /// shorter address, then lower row index. Rows with a blank address are
    /// skipped, and rows sharing the same (case-insensitive) address and
    /// postcode are reported once. An empty result is returned when `limit`
    /// is zero, the query has nothing searchable, or no candidate rows exist.
    pub fn search_addresses(&self, query: &str, limit: usize) -> Vec<usize> {
        if limit == 0 {
            return Vec::new();
        }

        let parsed = parse_address_query(query);
        if parsed.full_postcode.is_none()
            && parsed.text_groups.is_empty()
            && parsed.numeric_terms.is_empty()
        {
            return Vec::new();
        }

        let candidate_rows: Vec<u32> = if let Some(postcode) = parsed.full_postcode.as_deref() {
            self.postcode_interner
                .get(postcode)
                .and_then(|key| self.postcode_row_index.get(&key))
                .map(|rows| rows.to_vec())
                .unwrap_or_default()
        } else if let Some(rows) = self.best_address_token_rows(&parsed.candidate_terms) {
            // Bound the scoring work for very common tokens.
            rows.iter()
                .take(ADDRESS_SEARCH_CANDIDATE_LIMIT)
                .copied()
                .collect()
        } else {
            Vec::new()
        };

        if candidate_rows.is_empty() {
            return Vec::new();
        }

        let mut scored: Vec<(i32, usize, usize)> = candidate_rows
            .into_iter()
            .filter_map(|row| {
                let row = row as usize;
                self.address_match_score(row, &parsed)
                    .map(|score| (score, self.address(row).len(), row))
            })
            .collect();

        // Highest score first, then shortest address, then lowest row index.
        scored.sort_unstable_by(|left, right| {
            right
                .0
                .cmp(&left.0)
                .then(left.1.cmp(&right.1))
                .then(left.2.cmp(&right.2))
        });

        let mut seen = FxHashSet::default();
        // Never reserve more than can be returned: `limit` is caller-supplied
        // and may be far larger than the number of scored rows.
        let mut results = Vec::with_capacity(limit.min(scored.len()));
        for (_, _, row) in scored {
            let address = self.address(row).trim();
            if address.is_empty() {
                continue;
            }
            let key = format!("{}\n{}", address.to_ascii_lowercase(), self.postcode(row));
            if !seen.insert(key) {
                continue;
            }
            results.push(row);
            if results.len() == limit {
                break;
            }
        }

        results
    }

    /// Pick the shortest posting list reachable from any of `terms`.
    ///
    /// A term that is itself an indexed token uses its own posting list.
    /// Otherwise, terms of at least four bytes are expanded through the
    /// prefix index and every indexed token starting with the term is
    /// considered. Returns `None` when no term resolves to a posting list.
    fn best_address_token_rows(&self, terms: &[String]) -> Option<&[u32]> {
        /// Keep `best` unless `rows` is strictly shorter.
        fn keep_shorter<'a>(best: Option<&'a [u32]>, rows: &'a [u32]) -> Option<&'a [u32]> {
            match best {
                Some(current) if current.len() <= rows.len() => best,
                _ => Some(rows),
            }
        }

        let mut best: Option<&[u32]> = None;

        for term in terms {
            if let Some(rows) = self.address_token_index.get(term) {
                best = keep_shorter(best, rows.as_slice());
                continue;
            }

            // The prefix index holds no prefixes shorter than four bytes
            // (ADDRESS_SEARCH_PREFIX_MIN_LEN), so shorter terms cannot hit it.
            if term.len() < 4 {
                continue;
            }

            let Some(expansions) = self.address_prefix_index.get(address_prefix_key(term)) else {
                continue;
            };

            // The lookup key may be a truncated form of `term`, so re-check
            // the full term against each expansion.
            let matching_postings = expansions
                .iter()
                .filter(|token| token.starts_with(term.as_str()))
                .filter_map(|token| self.address_token_index.get(token));
            for rows in matching_postings {
                best = keep_shorter(best, rows.as_slice());
            }
        }

        best
    }

    /// Score `row` against a parsed query, or return `None` when it does not match.
    ///
    /// A row is rejected when its address is blank, when any text group has no
    /// matching token, or when numeric terms were given and none of them match.
    /// The score is 1000 for a postcode query, plus 200 per text group, plus
    /// 90 per matched numeric term, plus 50 when every numeric term matched.
    fn address_match_score(&self, row: usize, parsed: &AddressQuery) -> Option<i32> {
        if self.address(row).trim().is_empty() {
            return None;
        }

        let tokens = self.row_address_search_tokens(row);

        let every_group_matches = parsed
            .text_groups
            .iter()
            .all(|group| self.address_tokens_match_group(tokens, group));
        if !every_group_matches {
            return None;
        }

        let wanted_numeric = parsed.numeric_terms.len();
        let numeric_matches = parsed
            .numeric_terms
            .iter()
            .filter(|term| {
                tokens.iter().any(|token| {
                    token_matches_numeric_term(self.address_search_interner.resolve(token), term)
                })
            })
            .count();
        if wanted_numeric > 0 && numeric_matches == 0 {
            return None;
        }

        let postcode_bonus = if parsed.full_postcode.is_some() {
            1_000
        } else {
            0
        };
        let all_numeric_bonus = if numeric_matches > 0 && numeric_matches == wanted_numeric {
            50
        } else {
            0
        };

        Some(
            postcode_bonus
                + (parsed.text_groups.len() as i32) * 200
                + (numeric_matches as i32) * 90
                + all_numeric_bonus,
        )
    }

    /// Returns `true` when any of the row's interned tokens matches any
    /// spelling in `group`.
    fn address_tokens_match_group(&self, tokens: &[lasso::Spur], group: &AddressTermGroup) -> bool {
        tokens.iter().any(|token| {
            // Resolve each token once, then test it against every spelling.
            let text = self.address_search_interner.resolve(token);
            group
                .alternatives
                .iter()
                .any(|alternative| token_matches_query_term(text, alternative))
        })
    }

    /// Read the bit-packed "construction date is approximate" flag for `row`.
    ///
    /// The flag lives in byte `row / 8`, bit `row % 8`.
    pub fn is_approx_build_date(&self, row: usize) -> bool {
        let (byte_index, bit_index) = (row / 8, row % 8);
        (self.approx_build_date_bits[byte_index] >> bit_index) & 1 == 1
    }

    /// Renovation events recorded for `row`; an empty slice when the row has none.
    pub fn renovation_history(&self, row: usize) -> &[RenovationEvent] {
        match self.renovation_history.get(&(row as u32)) {
            Some(events) => events.as_slice(),
            None => &[],
        }
    }

    /// Property sub-type for `row`, or `None` when the row has no entry.
    pub fn property_sub_type(&self, row: usize) -> Option<&str> {
        let key = row as u32;
        let sub_type = self.property_sub_type.get(&key)?;
        Some(sub_type.as_str())
    }

    /// Price qualifier for `row`, or `None` when the row has no entry.
    pub fn price_qualifier(&self, row: usize) -> Option<&str> {
        let key = row as u32;
        match self.price_qualifier.get(&key) {
            Some(qualifier) => Some(qualifier.as_str()),
            None => None,
        }
    }

    /// Get the unquantized last sale price for charting.
    ///
    /// Returns the stored value for `row` as-is, without going through the
    /// u16 quantization used by `feature_data`. Panics if `row` is out of range.
    #[inline]
    pub fn last_known_price_raw(&self, row: usize) -> f32 {
        self.last_known_price_raw[row]
    }

    /// Decode one feature value for `row` from the quantized u16 storage.
    ///
    /// The NaN sentinel decodes to `f32::NAN`; features at or beyond
    /// `num_numeric` are returned as a plain cast of the stored value;
    /// numeric features are rescaled with the per-feature multiplier and offset.
    #[inline]
    pub fn get_feature(&self, row: usize, feat_idx: usize) -> f32 {
        let raw = self.feature_data[row * self.num_features + feat_idx];
        if raw == NAN_U16 {
            f32::NAN
        } else if feat_idx < self.num_numeric {
            raw as f32 * self.dequant_a[feat_idx] + self.quant_min[feat_idx]
        } else {
            raw as f32
        }
    }

    /// Get a QuantRef for passing to aggregation/filter functions.
    pub fn quant_ref(&self) -> QuantRef<'_> {
        QuantRef {
            dequant_a: &self.dequant_a,
            quant_min: &self.quant_min,
            quant_range: &self.quant_range,
            num_numeric: self.num_numeric,
        }
    }
}

/// Estimate a percentile from a uniformly-binned histogram over `[min, max]`.
///
/// The target rank is `floor(count * percentile / 100)`. The first bin whose
/// cumulative count exceeds that rank is located and the result is linearly
/// interpolated inside it. Returns `min` when there is no data and `max` when
/// the rank lies beyond every bin.
fn percentile_from_uniform_histogram(
    count: usize,
    min: f32,
    max: f32,
    prelim_counts: &[u64],
    percentile: f32,
) -> f32 {
    if count == 0 || prelim_counts.is_empty() {
        return min;
    }

    let target = (count as f64 * percentile as f64 / 100.0).floor() as u64;
    let bin_width = (max - min) / prelim_counts.len() as f32;

    // Number of values in all bins before the current one.
    let mut below = 0u64;
    for (index, &in_bin) in prelim_counts.iter().enumerate() {
        if below + in_bin <= target {
            below += in_bin;
            continue;
        }

        // The target rank falls inside this bin: interpolate within it.
        let bin_start = min + index as f32 * bin_width;
        let fraction = if in_bin > 0 {
            (target - below) as f32 / in_bin as f32
        } else {
            0.0
        };
        return bin_start + fraction * bin_width;
    }

    max
}

/// Build a histogram and compute slider bounds based on the feature's Bounds config.
///
/// Only finite values in `vals` are considered; NaN and infinities are skipped.
///
/// * `bounds` — `Bounds::Fixed` supplies the slider range directly, while
///   `Bounds::Percentile` derives it from the (refined) uniform histogram.
/// * `integer_bins` — when set, p1/p99 are snapped outward to whole numbers
///   and the histogram gets one middle bin per integer step.
///
/// When there are no finite values the result has an all-zero histogram with
/// `HISTOGRAM_BINS` bins, and percentile-based sliders collapse to `(0.0, 0.0)`.
pub fn compute_feature_stats(vals: &[f32], bounds: &Bounds, integer_bins: bool) -> FeatureStats {
    // Single pass: min, max, count (skipping NaN and infinity)
    let mut min = f32::INFINITY;
    let mut max = f32::NEG_INFINITY;
    let mut count = 0usize;
    for &value in vals {
        if value.is_finite() {
            if value < min {
                min = value;
            }
            if value > max {
                max = value;
            }
            count += 1;
        }
    }

    // No usable data: return an empty histogram and whatever the bounds allow.
    if count == 0 {
        let (slider_min, slider_max) = match bounds {
            Bounds::Fixed {
                min: fmin,
                max: fmax,
            } => (*fmin, *fmax),
            Bounds::Percentile { .. } => (0.0, 0.0),
        };
        return FeatureStats {
            slider_min,
            slider_max,
            histogram: Histogram {
                min: 0.0,
                max: 0.0,
                p1: 0.0,
                p99: 0.0,
                counts: vec![0; HISTOGRAM_BINS],
            },
        };
    }

    // Build preliminary histogram with uniform bins to compute percentiles
    // Use full HISTOGRAM_BINS for percentile precision
    // A constant feature (max == min) uses a nominal range of 1.0 so the bin
    // width is never zero; the upper edge is widened slightly before binning.
    let range = if max == min { 1.0 } else { max - min };
    let prelim_max = min + range * (1.0 + 1e-6);
    let prelim_bin_width = (prelim_max - min) / HISTOGRAM_BINS as f32;

    let mut prelim_counts = vec![0u64; HISTOGRAM_BINS];
    for &value in vals {
        if value.is_finite() {
            let bin = ((value - min) / prelim_bin_width) as usize;
            prelim_counts[bin.min(HISTOGRAM_BINS - 1)] += 1;
        }
    }

    // Compute p1 and p99 from preliminary histogram
    let mut p1 = percentile_from_uniform_histogram(count, min, max, &prelim_counts, 1.0);
    let mut p99 = percentile_from_uniform_histogram(count, min, max, &prelim_counts, 99.0);

    // Iterative refinement for outlier-dominated distributions.
    // When extreme outliers (e.g. 317M sqm from web scraping) dominate the range,
    // the uniform histogram puts all real data in one bin, making percentile
    // estimation useless. Zoom into the estimated data region and recompute.
    let mut refined_counts = prelim_counts;
    let mut refined_count = count;
    let mut refined_min = min;
    let mut refined_max = max;
    for _ in 0..3 {
        // Despite the name, this is the p1..p99 spread, not an interquartile range.
        let iqr = p99 - p1;
        // Stop once the window is no more than 5x the p1..p99 spread.
        if iqr <= 0.0 || (refined_max - refined_min) <= 5.0 * iqr {
            break;
        }
        // NOTE(review): the lower edge is clamped to the observed `min`, but the
        // upper edge is not clamped to `max`, so `refined_max` (and the reported
        // histogram `max`) can exceed the largest observed value — confirm intended.
        let new_min = (p1 - iqr).max(min);
        let new_max = p99 + iqr;
        if new_max <= new_min {
            break;
        }
        let bin_width = (new_max - new_min) / HISTOGRAM_BINS as f32;
        let mut counts = vec![0u64; HISTOGRAM_BINS];
        let mut cnt = 0usize;
        for &value in vals {
            if value.is_finite() && value >= new_min && value <= new_max {
                let bin = ((value - new_min) / bin_width) as usize;
                counts[bin.min(HISTOGRAM_BINS - 1)] += 1;
                cnt += 1;
            }
        }
        if cnt == 0 {
            break;
        }
        // Percentiles are now relative to the values inside the zoomed window only.
        p1 = percentile_from_uniform_histogram(cnt, new_min, new_max, &counts, 1.0);
        p99 = percentile_from_uniform_histogram(cnt, new_min, new_max, &counts, 99.0);
        refined_counts = counts;
        refined_count = cnt;
        refined_min = new_min;
        refined_max = new_max;
    }

    // For integer-binned features, snap p1/p99 to integer boundaries
    // so each middle bin is exactly 1 unit wide.
    if integer_bins {
        p1 = p1.floor();
        p99 = p99.ceil();
    }

    // Determine number of histogram bins
    let num_bins = if integer_bins && p99 > p1 {
        // One middle bin per integer + 2 outlier bins
        // NOTE(review): the bin count grows with (p99 - p1) and is not capped by
        // HISTOGRAM_BINS here — confirm integer features always have a small spread.
        (p99 - p1) as usize + 2
    } else {
        // Count unique values within the p1–p99 range to cap histogram bins.
        // Using the full-range cardinality would over-allocate bins when outliers
        // inflate it (e.g. bedrooms: 1–137 unique values but only ~10 within p1–p99).
        let cardinality = {
            let mut unique_set = rustc_hash::FxHashSet::default();
            for &val in vals {
                if val.is_finite() && val >= p1 && val <= p99 {
                    unique_set.insert(val.to_bits());
                }
            }
            unique_set.len()
        };
        // At least 3 bins: two outlier bins plus one middle bin.
        HISTOGRAM_BINS.min(cardinality).max(3)
    };

    // Build final histogram with outlier bins at edges:
    // - Bin 0: [min, p1) — low outliers
    // - Bins 1 to n-2: [p1, p99) — main distribution, evenly divided
    // - Bin n-1: [p99, max] — high outliers
    let mut counts = vec![0u64; num_bins];
    let middle_bins = num_bins.saturating_sub(2);
    let middle_width = if middle_bins > 0 && p99 > p1 {
        (p99 - p1) / middle_bins as f32
    } else {
        0.0
    };

    for &value in vals {
        if value.is_finite() {
            let bin = if value < p1 {
                0 // Low outlier bin
            } else if value >= p99 {
                num_bins - 1 // High outlier bin
            } else if middle_width > 0.0 {
                // Middle bins (1 to n-2)
                let middle_bin = ((value - p1) / middle_width) as usize;
                (1 + middle_bin).min(num_bins - 2)
            } else {
                num_bins / 2 // Fallback if p1 == p99
            };
            counts[bin] += 1;
        }
    }

    // The reported min/max are the refined window edges, which differ from the
    // raw data extremes whenever the refinement loop zoomed in.
    let histogram = Histogram {
        min: refined_min,
        max: refined_max,
        p1,
        p99,
        counts,
    };

    // Compute slider bounds (use refined histogram for accurate percentiles)
    let (slider_min, slider_max) = match bounds {
        Bounds::Fixed {
            min: fmin,
            max: fmax,
        } => (*fmin, *fmax),
        Bounds::Percentile { low, high } => {
            let p_low = percentile_from_uniform_histogram(
                refined_count,
                refined_min,
                refined_max,
                &refined_counts,
                *low as f32,
            );
            let p_high = percentile_from_uniform_histogram(
                refined_count,
                refined_min,
                refined_max,
                &refined_counts,
                *high as f32,
            );
            (p_low, p_high)
        }
    };

    FeatureStats {
        slider_min,
        slider_max,
        histogram,
    }
}

/// Materialises a column as a dense `Vec<f32>`.
///
/// The column is first cast to `Float32`; null entries are written out as
/// `f32::NAN` so the result always has one element per row.
///
/// # Errors
/// Fails if the cast to `Float32` is rejected or the cast result cannot be
/// viewed as an f32 chunked array.
fn column_to_f32_vec(column: &Column) -> anyhow::Result<Vec<f32>> {
    let as_float = column
        .cast(&DataType::Float32)
        .context("Failed to cast column to Float32")?;
    let f32_values = as_float
        .f32()
        .context("Failed to get f32 chunked array")?;

    let mut dense = Vec::with_capacity(f32_values.len());
    for slot in f32_values.into_iter() {
        dense.push(match slot {
            Some(number) => number,
            // Nulls have no numeric value; NaN is the in-band "missing" marker.
            None => f32::NAN,
        });
    }
    Ok(dense)
}

/// Precompute H3 cell IDs for all rows at the maximum resolution only.
/// Parent cells for lower resolutions are derived on the fly via `CellIndex::parent()`.
pub fn precompute_h3(lat: &[f32], lon: &[f32]) -> anyhow::Result<Vec<u64>> {
    let res = H3_PRECOMPUTE_MAX;
    tracing::info!("Precomputing H3 cells at resolution {}", res);

    let h3_res =
        h3o::Resolution::try_from(res).with_context(|| format!("Invalid H3 resolution: {res}"))?;

    let cells: Vec<u64> = lat
        .par_iter()
        .zip(lon.par_iter())
        .enumerate()
        .map(|(i, (&latitude, &longitude))| {
            let coord = h3o::LatLng::new(latitude as f64, longitude as f64).unwrap_or_else(|err| {
                panic!(
                    "Invalid coordinates at row {}: lat={}, lon={}: {}",
                    i, latitude, longitude, err
                )
            });
            u64::from(coord.to_cell(h3_res))
        })
        .collect();

    tracing::info!("H3 precomputation complete ({} cells)", cells.len());
    Ok(cells)
}

impl PropertyData {
    /// Loads the property dataset from parquet and builds all in-memory structures.
    ///
    /// The work proceeds in this order:
    /// 1. Read the postcode parquet and left-join it onto the properties parquet on
    ///    `Postcode` (this is where `lat`/`lon` come from).
    /// 2. Validate every configured numeric and enum feature against the joined schema.
    /// 3. Drop rows with null `lat`/`lon`, project the needed columns, and validate
    ///    coordinate ranges.
    /// 4. Extract numeric columns, compute per-feature stats and quantization ranges.
    /// 5. Extract string, enum, approximate-build-date and renovation-history columns.
    /// 6. Sort rows by a coarse lat/lon grid key and permute every per-row structure
    ///    into that order.
    /// 7. Build the address buffer and search indexes, intern postcodes, and write
    ///    the row-major `u16` feature matrix.
    ///
    /// # Errors
    /// Returns an error if either parquet file cannot be read, the join fails, a
    /// configured feature is missing or has an unexpected dtype, no row has usable
    /// coordinates, a coordinate is out of range, a required string column is
    /// missing, `renovation_history` has an unexpected shape, or the row count /
    /// address buffer / token list no longer fits the `u32` indices used here.
    pub fn load(properties_path: &Path, postcode_features_path: &Path) -> anyhow::Result<Self> {
        // Load postcode.parquet
        tracing::info!(
            "Loading postcode features from {:?}",
            postcode_features_path
        );
        let postcode_df = LazyFrame::scan_parquet(postcode_features_path, Default::default())
            .context("Failed to scan postcode parquet")?
            .collect()
            .context("Failed to read postcode parquet")?;
        tracing::info!(rows = postcode_df.height(), "Postcode features loaded");

        // Load properties.parquet and join with postcode data for lat/lon + area features
        tracing::info!("Loading properties from {:?}", properties_path);
        let properties_lf = LazyFrame::scan_parquet(properties_path, Default::default())
            .context("Failed to scan properties parquet")?;
        // The postcode frame is not needed after the join, so it is moved rather than cloned.
        let combined = properties_lf
            .join(
                postcode_df.lazy(),
                [col("Postcode")],
                [col("Postcode")],
                JoinArgs::new(JoinType::Left),
            )
            .collect()
            .context("Failed to join properties with postcodes")?;
        let total_rows = combined.height();
        tracing::info!(rows = total_rows, "Properties joined with postcodes");

        // Get configured feature/enum names in config order
        let numeric_names = features::all_numeric_feature_names();
        let enum_names = features::all_enum_feature_names();

        let schema = combined.schema();

        // Fail fast on configuration/schema drift before doing any heavy work.
        for name in &numeric_names {
            match schema.get(name) {
                Some(dtype) if is_numeric_dtype(dtype) => {}
                Some(dtype) => bail!(
                    "Configured numeric feature '{}' has non-numeric type {:?}",
                    name,
                    dtype
                ),
                None => bail!(
                    "Configured numeric feature '{}' not found in combined schema",
                    name
                ),
            }
        }
        for name in &enum_names {
            match schema.get(name) {
                Some(dtype) if matches!(dtype, DataType::String) || dtype.is_categorical() => {}
                Some(dtype) => bail!(
                    "Configured enum feature '{}' has unexpected type {:?}",
                    name,
                    dtype
                ),
                None => bail!(
                    "Configured enum feature '{}' not found in combined schema",
                    name
                ),
            }
        }

        // Combine numeric and enum feature names (numeric first, then enum)
        let feature_names: Vec<String> = numeric_names
            .iter()
            .chain(enum_names.iter())
            .map(|name| name.to_string())
            .collect();
        let num_features = feature_names.len();
        let num_numeric = numeric_names.len();
        tracing::info!(
            numeric = num_numeric,
            enums = enum_names.len(),
            total = num_features,
            "Feature columns from config"
        );

        // Build select expressions for the combined DataFrame
        let mut select_exprs: Vec<polars::prelude::Expr> = vec![];
        select_exprs.push(col("lat").cast(DataType::Float32));
        select_exprs.push(col("lon").cast(DataType::Float32));

        // Select numeric features as Float32 (datetime columns → fractional year)
        for &name in &numeric_names {
            // `unwrap` cannot fail: every numeric name was checked against the schema above.
            if is_datetime_dtype(schema.get(name).unwrap()) {
                select_exprs.push(
                    (col(name).dt().year().cast(DataType::Float32)
                        + (col(name).dt().month().cast(DataType::Float32) - lit(1.0f32))
                            / lit(12.0f32))
                    .alias(name),
                );
            } else {
                select_exprs.push(col(name).cast(DataType::Float32));
            }
        }

        // String columns for address/postcode and property metadata
        for &string_col_name in &[
            "Address per Property Register",
            "Address per EPC",
            "Postcode",
            "Property sub-type",
            "Price qualifier",
        ] {
            if schema.get(string_col_name).is_some() {
                select_exprs.push(col(string_col_name).cast(DataType::String));
            }
        }

        // Enum features as String
        for &name in &enum_names {
            select_exprs.push(col(name).cast(DataType::String));
        }

        // Optional columns
        let has_approx_col = schema.get("Is construction date approximate").is_some();
        if has_approx_col {
            select_exprs.push(col("Is construction date approximate").cast(DataType::Float32));
        }
        let has_renovation_history = schema.get("renovation_history").is_some();
        if has_renovation_history {
            select_exprs.push(col("renovation_history"));
        }
        let df = combined
            .lazy()
            .filter(col("lat").is_not_null().and(col("lon").is_not_null()))
            .select(select_exprs)
            .collect()
            .context("Failed to select columns from combined data")?;

        let row_count = df.height();
        if row_count == 0 {
            bail!("No property rows have usable coordinates after joining postcode data");
        }
        // Row indices are stored as u32 throughout; refuse to load rather than wrap silently.
        let row_count_u32 = u32::try_from(row_count)
            .with_context(|| format!("Row count {row_count} exceeds the u32 row index range"))?;
        let dropped_coordinate_rows = total_rows.saturating_sub(row_count);
        if dropped_coordinate_rows > 0 {
            tracing::warn!(
                rows = dropped_coordinate_rows,
                "Dropped properties with missing postcode coordinates"
            );
        }
        tracing::info!(rows = row_count, "Combined data selected");

        let lat_series = df
            .column("lat")
            .context("Missing 'lat' column")?
            .cast(&DataType::Float32)
            .context("Failed to cast 'lat' to Float32")?;
        let lat: Vec<f32> = lat_series
            .f32()
            .context("Failed to read 'lat' as f32")?
            .into_iter()
            .map(|value| value.context("Missing 'lat' value after coordinate filter"))
            .collect::<anyhow::Result<Vec<_>>>()?;

        let lon_series = df
            .column("lon")
            .context("Missing 'lon' column")?
            .cast(&DataType::Float32)
            .context("Failed to cast 'lon' to Float32")?;
        let lon: Vec<f32> = lon_series
            .f32()
            .context("Failed to read 'lon' as f32")?
            .into_iter()
            .map(|value| value.context("Missing 'lon' value after coordinate filter"))
            .collect::<anyhow::Result<Vec<_>>>()?;

        // The range checks also reject NaN, since NaN is not contained in any range.
        for (row, (&latitude, &longitude)) in lat.iter().zip(&lon).enumerate() {
            if !(-90.0..=90.0).contains(&latitude) || !(-180.0..=180.0).contains(&longitude) {
                bail!("Invalid coordinates at row {row}: lat={latitude}, lon={longitude}");
            }
        }

        tracing::info!("Extracting numeric feature columns");
        let numeric_col_major: Vec<Vec<f32>> = numeric_names
            .par_iter()
            .map(|name| {
                let column = df
                    .column(name)
                    .with_context(|| format!("Missing feature column '{name}'"))?;
                column_to_f32_vec(column)
            })
            .collect::<anyhow::Result<Vec<_>>>()?;

        tracing::info!("Computing histograms for numeric features");
        let numeric_feature_stats: Vec<FeatureStats> = numeric_col_major
            .par_iter()
            .enumerate()
            .map(|(feat_index, vals)| {
                let name = numeric_names[feat_index];
                let bounds = features::bounds_for(name)
                    .with_context(|| format!("No bounds config for feature '{}'", name))?;
                let stats = compute_feature_stats(vals, bounds, features::has_integer_bins(name));
                tracing::debug!(
                    feature = %name,
                    slider_min = format_args!("{:.2}", stats.slider_min),
                    slider_max = format_args!("{:.2}", stats.slider_max),
                    bins = stats.histogram.counts.len(),
                    "Feature stats"
                );
                Ok(stats)
            })
            .collect::<anyhow::Result<Vec<_>>>()?;

        // Compute quantization parameters from feature stats (numeric features).
        // For features with Fixed bounds, use those bounds so the full configured range
        // is representable — the histogram refinement can narrow min/max to exclude
        // "outliers" that are actually valid data (e.g. ethnicity percentages).
        // For Percentile-bounded features, use the (possibly refined) histogram range
        // so extreme outliers don't destroy precision for the main distribution.
        let mut quant_min = Vec::with_capacity(num_features);
        let mut quant_range = Vec::with_capacity(num_features);
        for (feat_idx, stats) in numeric_feature_stats.iter().enumerate() {
            let (min, max) = match features::bounds_for(numeric_names[feat_idx]) {
                Some(Bounds::Fixed { min, max }) => (*min, *max),
                _ => (stats.histogram.min, stats.histogram.max),
            };
            quant_min.push(min);
            // A zero range marks a degenerate (constant) feature; it quantizes to 0 below.
            quant_range.push(if max > min { max - min } else { 0.0 });
        }

        tracing::info!("Extracting string columns");
        // Required string column: nulls become empty strings.
        let extract_string_col = |df: &DataFrame, name: &str| -> anyhow::Result<Vec<String>> {
            let column = df
                .column(name)
                .with_context(|| format!("Required column '{name}' not found in parquet"))?;
            let string_column = column
                .str()
                .with_context(|| format!("Column '{name}' is not a string column"))?;
            Ok(string_column
                .into_iter()
                .map(|value| value.unwrap_or("").to_string())
                .collect())
        };

        let address_raw = extract_string_col(&df, "Address per Property Register")?;
        let postcode_raw = extract_string_col(&df, "Postcode")?;

        // Extract optional string columns: a missing column yields all-`None`;
        // null or whitespace-only cells also become `None`, other cells are trimmed.
        let extract_optional_string_col =
            |df: &DataFrame, name: &str| -> anyhow::Result<Vec<Option<String>>> {
                if let Ok(column) = df.column(name) {
                    let string_column = column
                        .str()
                        .with_context(|| format!("Column '{name}' is not a string column"))?;
                    Ok(string_column
                        .into_iter()
                        .map(|value| {
                            value.and_then(|s| {
                                let trimmed = s.trim();
                                if trimmed.is_empty() {
                                    None
                                } else {
                                    Some(trimmed.to_string())
                                }
                            })
                        })
                        .collect())
                } else {
                    Ok(vec![None; row_count])
                }
            };

        let property_sub_type_raw = extract_optional_string_col(&df, "Property sub-type")?;
        let price_qualifier_raw = extract_optional_string_col(&df, "Price qualifier")?;

        tracing::info!("Building enum features");
        // enum_col_major: Vec<(values_list, encoded_as_f32)>
        //
        // Every configured enum must produce exactly one entry here: later code maps
        // position `enum_idx` to feature index `num_numeric + enum_idx`, so silently
        // skipping a column would shift every following enum onto the wrong name.
        let enum_col_major: Vec<(Vec<String>, Vec<f32>)> = enum_names
            .par_iter()
            .map(|&name| -> anyhow::Result<(Vec<String>, Vec<f32>)> {
                let column_data = df
                    .column(name)
                    .with_context(|| format!("Missing enum feature column '{name}'"))?;
                let string_column = column_data.str().with_context(|| {
                    format!("Enum feature column '{name}' is not a string column")
                })?;
                // Distinct non-empty values; nulls and empty strings mean "missing".
                let unique_set: std::collections::HashSet<String> = string_column
                    .into_iter()
                    .filter_map(|value| {
                        let text = value.unwrap_or("");
                        if text.is_empty() {
                            None
                        } else {
                            Some(text.to_string())
                        }
                    })
                    .collect();

                // Use configured order if available, otherwise alphabetical
                let unique: Vec<String> = if let Some(order) = features::order_for(name) {
                    let mut ordered: Vec<String> = Vec::new();
                    for &ordered_value in order {
                        if unique_set.contains(ordered_value) {
                            ordered.push(ordered_value.to_string());
                        }
                    }
                    // Append any values not in the configured order, alphabetically
                    // Use HashSet for O(1) contains instead of O(n) slice search
                    let order_set: rustc_hash::FxHashSet<&str> = order.iter().copied().collect();
                    let mut remainder: Vec<String> = unique_set
                        .iter()
                        .filter(|value| !order_set.contains(value.as_str()))
                        .cloned()
                        .collect();
                    remainder.sort();
                    ordered.extend(remainder);
                    ordered
                } else {
                    let mut sorted: Vec<String> = unique_set.into_iter().collect();
                    sorted.sort();
                    sorted
                };

                let value_to_idx: std::collections::HashMap<&str, f32> = unique
                    .iter()
                    .enumerate()
                    .map(|(index, value)| (value.as_str(), index as f32))
                    .collect();

                // Encode each row as the index of its value in `unique`; missing → NaN.
                let encoded: Vec<f32> = string_column
                    .into_iter()
                    .map(|value| {
                        let text = value.unwrap_or("");
                        if text.is_empty() {
                            f32::NAN
                        } else {
                            *value_to_idx.get(text).unwrap_or(&f32::NAN)
                        }
                    })
                    .collect();

                tracing::debug!(column = %name, unique_values = unique.len(), "Enum feature encoded as f32");
                Ok((unique, encoded))
            })
            .collect::<anyhow::Result<Vec<_>>>()?;

        // Extract is_approx_build_date: 0.0 = exact, anything else (1.0/NaN) = approximate
        let is_approx_build_date_raw: Vec<bool> = if has_approx_col {
            let column_data = df
                .column("Is construction date approximate")
                .context("Missing 'Is construction date approximate' column")?;
            let float_series = column_data
                .cast(&DataType::Float32)
                .context("Failed to cast 'Is construction date approximate' to Float32")?;
            let chunked = float_series
                .f32()
                .context("Failed to read 'Is construction date approximate' as f32")?;
            chunked
                .into_iter()
                // Only an explicit 0.0 is "exact"; 1.0, NaN and null → approximate.
                .map(|value| !matches!(value, Some(flag) if flag == 0.0))
                .collect()
        } else {
            vec![true; row_count] // default: all approximate
        };

        // Extract renovation_history: List<Struct{year: i32, event: str}>
        // Keyed by the pre-sort row index here; re-keyed after the spatial sort below.
        let mut renovation_raw: FxHashMap<u32, Vec<RenovationEvent>> = if has_renovation_history {
            tracing::info!("Extracting renovation history");
            let reno_col = df
                .column("renovation_history")
                .context("Missing renovation_history column")?;
            let list_ca = reno_col
                .list()
                .context("renovation_history is not a list column")?;

            let mut history: FxHashMap<u32, Vec<RenovationEvent>> = FxHashMap::default();
            for old_row in 0..row_count {
                if let Some(inner) = list_ca.get_as_series(old_row) {
                    if inner.is_empty() {
                        continue;
                    }
                    let structs = inner
                        .struct_()
                        .context("renovation_history inner is not a struct")?;
                    let years = structs
                        .field_by_name("year")
                        .context("Missing 'year' field in renovation_history struct")?;
                    let events = structs
                        .field_by_name("event")
                        .context("Missing 'event' field in renovation_history struct")?;

                    let mut row_events = Vec::new();
                    for idx in 0..inner.len() {
                        let year = years.get(idx).context("Failed to get year value")?;
                        let event = events.get(idx).context("Failed to get event value")?;
                        // Entries whose year/event are null or of another type are skipped.
                        if let (AnyValue::Int32(yr), AnyValue::String(ev)) = (&year, &event) {
                            row_events.push(RenovationEvent {
                                year: *yr,
                                event: ev.to_string(),
                            });
                        }
                    }
                    if !row_events.is_empty() {
                        history.insert(old_row as u32, row_events);
                    }
                }
            }
            tracing::info!(
                properties_with_events = history.len(),
                "Renovation history extracted"
            );
            history
        } else {
            FxHashMap::default()
        };

        // Sort all rows by spatial locality so that grid queries access
        // contiguous memory (sequential reads instead of random DRAM accesses).
        tracing::info!("Sorting rows by spatial locality");
        let grid_cell_size = 0.01_f32;
        let min_lat_val = lat.iter().cloned().fold(f32::INFINITY, f32::min) - grid_cell_size;
        let min_lon_val = lon.iter().cloned().fold(f32::INFINITY, f32::min) - grid_cell_size;
        let max_lon_val = lon.iter().cloned().fold(f32::NEG_INFINITY, f32::max) + grid_cell_size;
        let grid_cols = ((max_lon_val - min_lon_val) / grid_cell_size).ceil() as u64 + 1;

        // `perm[new_row]` is the pre-sort row that ends up at position `new_row`.
        let mut perm: Vec<u32> = (0..row_count_u32).collect();
        perm.par_sort_unstable_by_key(|&perm_index| {
            let grid_row = ((lat[perm_index as usize] - min_lat_val) / grid_cell_size) as u64;
            let grid_col = ((lon[perm_index as usize] - min_lon_val) / grid_cell_size) as u64;
            grid_row * grid_cols + grid_col
        });

        let lat: Vec<f32> = perm
            .iter()
            .map(|&perm_index| lat[perm_index as usize])
            .collect();
        let lon: Vec<f32> = perm
            .iter()
            .map(|&perm_index| lon[perm_index as usize])
            .collect();
        // Unquantized copy of the price column (NaN everywhere if it is not configured).
        let last_known_price_raw: Vec<f32> = numeric_names
            .iter()
            .position(|&name| name == "Last known price")
            .map(|price_idx| {
                perm.iter()
                    .map(|&perm_index| numeric_col_major[price_idx][perm_index as usize])
                    .collect()
            })
            .unwrap_or_else(|| vec![f32::NAN; row_count]);

        // Build contiguous address buffer and address search index (permuted)
        tracing::info!("Building interned strings");
        let total_addr_bytes: usize = address_raw.iter().map(|text| text.len()).sum();
        let mut address_buffer = String::with_capacity(total_addr_bytes);
        let mut address_offsets = Vec::with_capacity(row_count);
        let mut address_lengths = Vec::with_capacity(row_count);
        let mut address_token_index: FxHashMap<String, Vec<u32>> = FxHashMap::default();
        let mut address_search_rodeo = lasso::Rodeo::default();
        let mut address_search_token_keys: Vec<lasso::Spur> = Vec::new();
        let mut address_search_token_offsets = Vec::with_capacity(row_count);
        let mut address_search_token_lengths = Vec::with_capacity(row_count);
        for (new_row, &perm_index) in perm.iter().enumerate() {
            let addr = &address_raw[perm_index as usize];
            // Offsets are u32; fail loudly instead of wrapping once the buffer passes 4 GiB.
            let offset = u32::try_from(address_buffer.len())
                .context("Address buffer exceeds the u32 offset range")?;
            // Lengths are u16, so over-long addresses are truncated. Back up to a UTF-8
            // char boundary first: slicing a `str` mid-character panics.
            let mut byte_len = addr.len().min(u16::MAX as usize);
            while !addr.is_char_boundary(byte_len) {
                byte_len -= 1;
            }
            let length = byte_len as u16;
            address_offsets.push(offset);
            address_lengths.push(length);
            address_buffer.push_str(&addr[..byte_len]);

            let search_tokens = address_search_tokens(addr);
            let token_offset = u32::try_from(address_search_token_keys.len())
                .context("Address search token list exceeds the u32 offset range")?;
            let token_length = search_tokens.len().min(u16::MAX as usize) as u16;
            address_search_token_offsets.push(token_offset);
            address_search_token_lengths.push(token_length);

            for token in search_tokens.iter().take(token_length as usize) {
                let key = address_search_rodeo.get_or_intern(token);
                address_search_token_keys.push(key);

                if is_address_candidate_token(token) {
                    address_token_index
                        .entry(token.clone())
                        .or_default()
                        .push(new_row as u32);
                }
            }
        }
        let address_token_count_before_prune = address_token_index.len();
        // Drop tokens whose posting list exceeds the configured cap.
        address_token_index.retain(|_, rows| rows.len() <= ADDRESS_SEARCH_MAX_POSTINGS_PER_TOKEN);
        let address_prefix_index = build_address_prefix_index(&address_token_index);
        let address_search_interner = address_search_rodeo.into_reader();
        let address_postings_count: usize = address_token_index.values().map(Vec::len).sum();
        tracing::info!(
            tokens = address_token_index.len(),
            prefixes = address_prefix_index.len(),
            pruned_tokens =
                address_token_count_before_prune.saturating_sub(address_token_index.len()),
            postings = address_postings_count,
            row_tokens = address_search_token_keys.len(),
            "Address search index built"
        );

        // Intern postcodes (permuted)
        let mut postcode_rodeo = lasso::Rodeo::default();
        let mut postcode_keys: Vec<lasso::Spur> = Vec::with_capacity(row_count);
        let mut postcode_row_index: FxHashMap<lasso::Spur, Vec<u32>> = FxHashMap::default();
        for (new_row, &perm_index) in perm.iter().enumerate() {
            let key = postcode_rodeo.get_or_intern(&postcode_raw[perm_index as usize]);
            postcode_keys.push(key);
            postcode_row_index
                .entry(key)
                .or_default()
                .push(new_row as u32);
        }
        let postcode_interner = postcode_rodeo.into_reader();

        // Pack is_approx_build_date into a bitvec (8 bools per byte)
        let num_bytes = row_count.div_ceil(8);
        let mut approx_build_date_bits = vec![0u8; num_bytes];
        for (new_row, &old_row) in perm.iter().enumerate() {
            if is_approx_build_date_raw[old_row as usize] {
                approx_build_date_bits[new_row / 8] |= 1 << (new_row % 8);
            }
        }

        // Re-key renovation_history by permuted row index
        let renovation_history: FxHashMap<u32, Vec<RenovationEvent>> = {
            let mut map =
                FxHashMap::with_capacity_and_hasher(renovation_raw.len(), Default::default());
            for (new_row, &old_row) in perm.iter().enumerate() {
                // `remove` moves the events out instead of cloning them.
                if let Some(events) = renovation_raw.remove(&old_row) {
                    map.insert(new_row as u32, events);
                }
            }
            map
        };

        // Permute optional string columns into sparse HashMaps
        let property_sub_type: FxHashMap<u32, String> = {
            let mut map = FxHashMap::default();
            for (new_row, &old_row) in perm.iter().enumerate() {
                if let Some(ref s) = property_sub_type_raw[old_row as usize] {
                    map.insert(new_row as u32, s.clone());
                }
            }
            map
        };
        let price_qualifier: FxHashMap<u32, String> = {
            let mut map = FxHashMap::default();
            for (new_row, &old_row) in perm.iter().enumerate() {
                if let Some(ref s) = price_qualifier_raw[old_row as usize] {
                    map.insert(new_row as u32, s.clone());
                }
            }
            map
        };

        // Build enum_values map: feature_index -> list of string values
        // and enum_counts map: feature_index -> per-value global counts
        let mut enum_values: rustc_hash::FxHashMap<usize, Vec<String>> =
            rustc_hash::FxHashMap::default();
        let mut enum_counts: rustc_hash::FxHashMap<usize, Vec<u64>> =
            rustc_hash::FxHashMap::default();
        for (enum_idx, (values, encoded)) in enum_col_major.iter().enumerate() {
            let feature_idx = num_numeric + enum_idx;
            enum_values.insert(feature_idx, values.clone());
            let mut counts = vec![0u64; values.len()];
            for &val in encoded {
                if val.is_finite() {
                    let idx = val as usize;
                    if idx < counts.len() {
                        counts[idx] += 1;
                    }
                }
            }
            enum_counts.insert(feature_idx, counts);
        }

        // Build feature_stats: numeric stats + placeholder stats for enums
        let mut feature_stats = numeric_feature_stats;
        for (values, _) in &enum_col_major {
            // For enum features, slider range is 0 to num_values-1
            let num_values = values.len();
            let max_val = num_values as f32;
            feature_stats.push(FeatureStats {
                slider_min: 0.0,
                slider_max: (num_values.saturating_sub(1)) as f32,
                histogram: Histogram {
                    min: 0.0,
                    max: max_val,
                    p1: 0.0,
                    p99: max_val,
                    counts: vec![0; num_values.max(1)],
                },
            });
            // Enum features: not quantized, stored directly as u16
            quant_min.push(0.0);
            quant_range.push(0.0);
        }
        // Per-feature dequantization step: range / QUANT_SCALE (0 for unquantized features).
        let dequant_a: Vec<f32> = quant_range
            .iter()
            .map(|&r| if r > 0.0 { r / QUANT_SCALE } else { 0.0 })
            .collect();

        // Transpose to row-major AND apply spatial permutation in one pass.
        // Combines numeric and enum features into a single feature_data array, quantized to u16.
        tracing::info!("Transposing to row-major layout (spatially sorted, quantized to u16)");
        let mut feature_data = vec![NAN_U16; row_count * num_features];
        feature_data
            .par_chunks_mut(num_features)
            .enumerate()
            .for_each(|(new_row, row_slice)| {
                let old_index = perm[new_row] as usize;
                // Numeric features: quantize to u16
                for (feat_idx, col_vec) in numeric_col_major.iter().enumerate() {
                    let value = col_vec[old_index];
                    row_slice[feat_idx] = if value.is_finite() {
                        let range = quant_range[feat_idx];
                        if range > 0.0 {
                            // Values outside [min, min + range] are clamped to the ends.
                            let normalized = (value - quant_min[feat_idx]) / range;
                            (normalized * QUANT_SCALE).round().clamp(0.0, QUANT_SCALE) as u16
                        } else {
                            0
                        }
                    } else {
                        NAN_U16
                    };
                }
                // Enum features: store as u16 directly
                for (enum_idx, (_, encoded)) in enum_col_major.iter().enumerate() {
                    let value = encoded[old_index];
                    row_slice[num_numeric + enum_idx] = if value.is_finite() {
                        value as u16
                    } else {
                        NAN_U16
                    };
                }
            });

        tracing::info!("Data loading complete");

        Ok(PropertyData {
            lat,
            lon,
            feature_names,
            num_features,
            num_numeric,
            feature_data,
            dequant_a,
            quant_min,
            quant_range,
            feature_stats,
            last_known_price_raw,
            address_buffer,
            address_offsets,
            address_lengths,
            postcode_interner,
            postcode_keys,
            postcode_row_index,
            address_token_index,
            address_prefix_index,
            address_search_interner,
            address_search_token_keys,
            address_search_token_offsets,
            address_search_token_lengths,
            enum_values,
            enum_counts,
            approx_build_date_bits,
            renovation_history,
            property_sub_type,
            price_qualifier,
        })
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::features::Bounds;

    fn make_fixed_bounds(min: f32, max: f32) -> Bounds {
        Bounds::Fixed { min, max }
    }

    fn make_percentile_bounds(low: f64, high: f64) -> Bounds {
        Bounds::Percentile { low, high }
    }

    #[test]
    fn full_postcode_detection_accepts_common_formats() {
        assert!(is_full_postcode_compact("SW1A1AA"));
        assert!(is_full_postcode_compact("E142DG"));
        assert!(is_full_postcode_compact("M11AE"));
        assert!(!is_full_postcode_compact("E14"));
        assert!(!is_full_postcode_compact("DOWNING"));
        assert!(!is_full_postcode_compact("10A"));
    }

    #[test]
    fn address_query_parsing_skips_postcodes_and_street_suffixes() {
        let parsed = parse_address_query("Flat 2, 10 Downing St, SW1A 2AA");

        assert_eq!(parsed.full_postcode.as_deref(), Some("SW1A 2AA"));
        assert_eq!(
            parsed.numeric_terms,
            vec!["10".to_string(), "2".to_string()]
        );
        assert_eq!(parsed.candidate_terms, vec!["downing".to_string()]);
        assert_eq!(parsed.text_groups.len(), 1);
        assert_eq!(
            parsed.text_groups[0].alternatives,
            vec!["downing".to_string()]
        );
    }

    /// A postcode typed without its space ("sw1a1aa") must come back as
    /// "SW1A 1AA", alongside the numeric term "10" and candidate "downing".
    #[test]
    fn address_query_parsing_handles_compact_postcodes() {
        let query = parse_address_query("10 downing street sw1a1aa");
        let postcode = query.full_postcode.as_deref();

        assert_eq!(postcode, Some("SW1A 1AA"));
        assert_eq!(query.numeric_terms, ["10"]);
        assert_eq!(query.candidate_terms, ["downing"]);
    }

    /// A query ending in a partial word ("cour") must keep that word both as
    /// a candidate term and as its own text group.
    #[test]
    fn address_query_parsing_keeps_partial_terms_for_row_matching() {
        let query = parse_address_query("settlers cour");

        assert!(query.full_postcode.is_none());
        assert!(query.numeric_terms.is_empty());
        assert_eq!(query.candidate_terms, ["settlers", "cour"]);

        // View each text group through its list of alternatives.
        let group_alternatives: Vec<&[String]> = query
            .text_groups
            .iter()
            .map(|group| group.alternatives.as_slice())
            .collect();
        assert_eq!(group_alternatives.len(), 2);
        assert_eq!(group_alternatives[0], ["settlers"]);
        assert_eq!(group_alternatives[1], ["cour"]);
    }

    /// `address_search_tokens` must return every term of the address,
    /// including "flat" and the partial word "cour". The expected list
    /// below is lowercase and in lexicographic order.
    #[test]
    fn address_search_tokens_keep_actual_address_terms_for_scoring() {
        let expected: Vec<String> = ["10", "2", "cour", "downing", "flat"]
            .iter()
            .map(|term| term.to_string())
            .collect();

        let actual = address_search_tokens("Flat 2, 10 Downing Cour");

        assert_eq!(actual, expected);
    }

    /// Builds a prefix index from three tokens and checks which tokens are
    /// listed under the prefixes "down", "downi" and "downt", and that the
    /// two-character prefix "do" has no entry.
    #[test]
    fn address_prefix_index_finds_partial_address_terms() {
        let token_index: FxHashMap<String, Vec<u32>> =
            [("downing", 1_u32), ("downton", 2), ("market", 3)]
                .iter()
                .map(|&(token, row)| (token.to_string(), vec![row]))
                .collect();

        let prefix_index = build_address_prefix_index(&token_index);

        // A missing prefix reads as an empty list, as in a plain lookup
        // followed by `unwrap_or_default`.
        let tokens_for = |prefix: &str| prefix_index.get(prefix).cloned().unwrap_or_default();
        let owned = |terms: &[&str]| -> Vec<String> {
            terms.iter().map(|term| term.to_string()).collect()
        };

        assert_eq!(tokens_for("down"), owned(&["downing", "downton"]));
        assert_eq!(tokens_for("downi"), owned(&["downing"]));
        assert_eq!(tokens_for("downt"), owned(&["downton"]));
        assert!(!prefix_index.contains_key("do"));
    }

    /// Tokens of "10 Downing Street" must match both a group built from the
    /// prefix "down" and a hand-built group whose alternatives are
    /// "st" / "street".
    #[test]
    fn address_term_matching_allows_prefixes_and_aliases() {
        let address_tokens = tokenize_address_text("10 Downing Street");

        let by_prefix = address_term_group("down").expect("prefix term should be searchable");
        assert!(address_tokens_match_group(&address_tokens, &by_prefix));

        let by_alias = AddressTermGroup {
            alternatives: ["st", "street"]
                .iter()
                .map(|alias| alias.to_string())
                .collect(),
        };
        assert!(address_tokens_match_group(&address_tokens, &by_alias));
    }

    /// The three-character term "cou" must yield a searchable group that
    /// matches the tokens of "12 Settlers Court".
    #[test]
    fn address_term_matching_uses_actual_token_prefixes() {
        let group = address_term_group("cou").expect("partial term should be searchable");
        let address_tokens = tokenize_address_text("12 Settlers Court");

        let matched = address_tokens_match_group(&address_tokens, &group);

        assert!(matched);
    }

    /// With no samples, the slider range must equal the fixed bounds and the
    /// histogram must hold zero counts.
    #[test]
    fn histogram_empty_data() {
        let samples = Vec::<f32>::new();
        let stats = compute_feature_stats(&samples, &make_fixed_bounds(0.0, 100.0), false);

        let total: u64 = stats.histogram.counts.iter().sum();
        assert_eq!(total, 0);
        assert_eq!(stats.slider_min, 0.0);
        assert_eq!(stats.slider_max, 100.0);
    }

    /// A single sample must be both the histogram min and max, and must be
    /// counted exactly once.
    #[test]
    fn histogram_single_value() {
        let samples = vec![50.0_f32];
        let stats = compute_feature_stats(&samples, &make_fixed_bounds(0.0, 100.0), false);
        let hist = &stats.histogram;

        let total: u64 = hist.counts.iter().sum();
        assert_eq!(hist.min, 50.0);
        assert_eq!(hist.max, 50.0);
        assert_eq!(total, 1);
    }

    /// For the samples 0, 1, ..., 99 the histogram must span 0..=99 and
    /// account for all 100 values.
    #[test]
    fn histogram_uniform_distribution() {
        let samples: Vec<f32> = (0..100).map(|step| step as f32).collect();
        let stats = compute_feature_stats(&samples, &make_fixed_bounds(0.0, 100.0), false);
        let hist = &stats.histogram;

        let total: u64 = hist.counts.iter().sum();
        assert_eq!(total, 100);
        assert_eq!(hist.min, 0.0);
        assert_eq!(hist.max, 99.0);
    }

    /// NaN samples mixed into the data must not be counted and must not
    /// affect the histogram min/max.
    #[test]
    fn histogram_with_nan_values() {
        let samples = vec![10.0_f32, f32::NAN, 20.0, f32::NAN, 30.0];
        let stats = compute_feature_stats(&samples, &make_fixed_bounds(0.0, 100.0), false);
        let hist = &stats.histogram;

        // Only the three non-NaN samples are expected to land in a bin.
        let total: u64 = hist.counts.iter().sum();
        assert_eq!(total, 3);
        assert_eq!(hist.min, 10.0);
        assert_eq!(hist.max, 30.0);
    }

    /// When every sample is NaN the histogram must contain zero counts.
    #[test]
    fn histogram_all_nan() {
        let samples = vec![f32::NAN; 3];
        let stats = compute_feature_stats(&samples, &make_fixed_bounds(0.0, 100.0), false);

        let total: u64 = stats.histogram.counts.iter().sum();
        assert_eq!(total, 0);
    }

    /// 1000 copies of one value: min, max, p1 and p99 must all equal that
    /// value, and all 1000 samples must be counted.
    #[test]
    fn histogram_all_same_value() {
        let samples: Vec<f32> = std::iter::repeat(42.0_f32).take(1000).collect();
        let stats = compute_feature_stats(&samples, &make_fixed_bounds(0.0, 100.0), false);
        let hist = &stats.histogram;

        assert_eq!(hist.min, 42.0);
        assert_eq!(hist.max, 42.0);
        assert_eq!(hist.p1, 42.0);
        assert_eq!(hist.p99, 42.0);

        let total: u64 = hist.counts.iter().sum();
        assert_eq!(total, 1000);
    }

    /// With percentile bounds (2, 98) the slider range must sit strictly
    /// inside the two outliers placed at either end of the data.
    #[test]
    fn histogram_percentile_bounds() {
        // One low outlier (0), 98 values just above 50, one high outlier (1000).
        let samples: Vec<f32> = std::iter::once(0.0)
            .chain((1..99).map(|i| 50.0 + i as f32 * 0.01))
            .chain(std::iter::once(1000.0))
            .collect();

        let stats = compute_feature_stats(&samples, &make_percentile_bounds(2.0, 98.0), false);

        assert!(stats.slider_min > 0.0);
        assert!(stats.slider_max < 1000.0);
    }

    /// Fixed bounds must be reported unchanged as the slider range even when
    /// one sample (3.75M) lies above the fixed maximum (2.5M).
    #[test]
    fn fixed_price_bounds_keep_slider_cap() {
        let prices = vec![400_000.0_f32, 2_500_000.0, 3_750_000.0];
        let capped = make_fixed_bounds(0.0, 2_500_000.0);

        let stats = compute_feature_stats(&prices, &capped, false);

        assert_eq!(stats.slider_max, 2_500_000.0);
        assert_eq!(stats.slider_min, 0.0);
    }

    /// For a 10-bin histogram with p1 = 10 and p99 = 90: a value below p1
    /// must map to bin 0, a value above p99 to bin 9, and a value in
    /// between to one of bins 1..=8.
    #[test]
    fn histogram_bin_for_value() {
        let hist = Histogram {
            counts: vec![0; 10],
            min: 0.0,
            p1: 10.0,
            p99: 90.0,
            max: 100.0,
        };

        // Values outside [p1, p99] are expected in the first / last bin.
        let low_outlier_bin = hist.bin_for_value(5.0);
        let high_outlier_bin = hist.bin_for_value(95.0);
        assert_eq!(low_outlier_bin, 0);
        assert_eq!(high_outlier_bin, 9);

        // A value between p1 and p99 must land in one of the interior bins.
        let interior_bin = hist.bin_for_value(50.0);
        assert!((1..=8).contains(&interior_bin));
    }

    /// For a 10-bin histogram with p1 = 10 and p99 = 90, `middle_bin_width`
    /// must equal (p99 - p1) / 8 to within 0.001.
    #[test]
    fn histogram_middle_bin_width() {
        let hist = Histogram {
            counts: vec![0; 10],
            min: 0.0,
            p1: 10.0,
            p99: 90.0,
            max: 100.0,
        };

        let interior_bins = 8.0;
        let expected = (90.0 - 10.0) / interior_bins;

        let deviation = (hist.middle_bin_width() - expected).abs();
        assert!(deviation < 0.001);
    }

    /// Data with only three distinct values must produce a histogram with
    /// exactly three bins.
    #[test]
    fn histogram_cardinality_caps_bins() {
        // Three distinct values, each appearing twice.
        let samples = vec![1.0_f32, 1.0, 2.0, 2.0, 3.0, 3.0];
        let stats = compute_feature_stats(&samples, &make_fixed_bounds(0.0, 100.0), false);

        let bin_count = stats.histogram.counts.len();
        assert_eq!(bin_count, 3);
    }

    /// A min/max scan restricted to finite values must ignore NaN entries.
    ///
    /// NOTE(review): this test exercises only the local fold below; it does
    /// not call into the module under test.
    #[test]
    fn min_max_skips_nan() {
        let samples = [10.0_f32, f32::NAN, 20.0, f32::NAN, 5.0];

        let (lowest, highest) = samples
            .iter()
            .copied()
            .filter(|sample| sample.is_finite())
            .fold(
                (f32::INFINITY, f32::NEG_INFINITY),
                |(lo, hi), sample| (lo.min(sample), hi.max(sample)),
            );

        assert_eq!(lowest, 5.0);
        assert_eq!(highest, 20.0);
    }

    /// Counting only finite values must skip NaN entries.
    ///
    /// NOTE(review): this test exercises only the local loop below; it does
    /// not call into the module under test.
    #[test]
    fn count_skips_nan() {
        let samples = [1.0_f32, f32::NAN, 2.0, f32::NAN, 3.0];

        let mut finite_total = 0_usize;
        for sample in samples.iter() {
            if sample.is_finite() {
                finite_total += 1;
            }
        }

        assert_eq!(finite_total, 3);
    }

    /// Tallies float-encoded enum indices into per-variant counters,
    /// skipping non-finite values and indices outside the enum range.
    ///
    /// NOTE(review): this test exercises only the local pipeline below; it
    /// does not call into the module under test.
    #[test]
    fn enum_value_counting() {
        let encoded = [0.0_f32, 1.0, 1.0, 2.0, f32::NAN, 3.0, 1.0];
        let variant_total = 4;

        let mut tallies = vec![0u64; variant_total];
        encoded
            .iter()
            .filter(|value| value.is_finite())
            .map(|&value| value as usize)
            .filter(|&slot| slot < variant_total)
            .for_each(|slot| tallies[slot] += 1);

        assert_eq!(tallies, vec![1, 3, 1, 1]);
    }

    /// Positive and negative infinity must be dropped: only the single
    /// finite sample may define the histogram min/max and be counted.
    #[test]
    fn infinity_values_excluded() {
        let data = vec![f32::INFINITY, f32::NEG_INFINITY, 50.0];
        // Use the shared helper, as the other histogram tests in this module
        // do, rather than spelling out `Bounds::Fixed` by hand.
        let bounds = make_fixed_bounds(0.0, 100.0);
        let stats = compute_feature_stats(&data, &bounds, false);

        assert_eq!(stats.histogram.min, 50.0);
        assert_eq!(stats.histogram.max, 50.0);
        assert_eq!(stats.histogram.counts.iter().sum::<u64>(), 1);
    }

    /// Baseline case with only finite samples: histogram min/max must be the
    /// data extremes and every sample must be counted.
    #[test]
    fn only_finite_values() {
        let data = vec![10.0_f32, 20.0, 30.0];
        // Use the shared helper, as the other histogram tests in this module
        // do, rather than spelling out `Bounds::Fixed` by hand.
        let bounds = make_fixed_bounds(0.0, 100.0);
        let stats = compute_feature_stats(&data, &bounds, false);

        assert_eq!(stats.histogram.min, 10.0);
        assert_eq!(stats.histogram.max, 30.0);
        assert_eq!(stats.histogram.counts.iter().sum::<u64>(), 3);
    }

    /// A single enormous outlier must not stretch the histogram range so far
    /// that ordinary values collapse onto the same quantized code.
    #[test]
    fn extreme_outlier_does_not_destroy_quantization() {
        // Simulated floor areas: 10k ordinary values (50-200 sqm) followed by
        // one 317M outlier of the kind produced by web scraping.
        let samples: Vec<f32> = (0..10_000)
            .map(|i| 50.0 + (i % 150) as f32)
            .chain(std::iter::once(317_000_000.0))
            .collect();

        let stats = compute_feature_stats(&samples, &make_percentile_bounds(0.0, 98.0), false);
        let hist = &stats.histogram;

        // The refined histogram range must be far tighter than 317M.
        assert!(
            hist.max < 1_000_000.0,
            "histogram.max should be refined, got {}",
            hist.max,
        );
        // p1 must stay close to 50 rather than drift into the millions.
        assert!(
            hist.p1 < 100.0,
            "p1 should be near real data, got {}",
            hist.p1,
        );
        // The slider minimum must track the real data range.
        assert!(
            stats.slider_min < 100.0,
            "slider_min should be near real data, got {}",
            stats.slider_min,
        );

        // Quantizing against histogram.min/max must leave a usable range.
        let quant_min = hist.min;
        let quant_range = hist.max - hist.min;
        assert!(quant_range > 0.0);
        assert!(quant_range < 1_000_000.0);

        // A typical floor area (100 sqm) must be distinguishable from the minimum.
        let normalized = (100.0 - quant_min) / quant_range;
        let scaled = normalized * QUANT_SCALE;
        let encoded = scaled.round() as u16;
        assert!(
            encoded > 100,
            "100 sqm should encode to a meaningful u16 value, got {}",
            encoded,
        );
    }
}