use anyhow::{bail, Context}; use polars::lazy::frame::LazyFrame; use polars::prelude::*; use rayon::prelude::*; use serde::Serialize; use std::path::Path; use rustc_hash::{FxHashMap, FxHashSet}; use crate::consts::{H3_PRECOMPUTE_MAX, HISTOGRAM_BINS, NAN_U16, QUANT_SCALE}; use crate::features::{self, Bounds}; const ADDRESS_SEARCH_CANDIDATE_LIMIT: usize = 50_000; const ADDRESS_SEARCH_MAX_POSTINGS_PER_TOKEN: usize = 250_000; const ADDRESS_SEARCH_PREFIX_MIN_LEN: usize = 4; const ADDRESS_SEARCH_PREFIX_MAX_LEN: usize = 8; fn is_numeric_dtype(dtype: &DataType) -> bool { matches!( dtype, DataType::Int8 | DataType::Int16 | DataType::Int32 | DataType::Int64 | DataType::UInt8 | DataType::UInt16 | DataType::UInt32 | DataType::UInt64 | DataType::Float32 | DataType::Float64 | DataType::Datetime(_, _) | DataType::Date ) } fn is_datetime_dtype(dtype: &DataType) -> bool { matches!(dtype, DataType::Datetime(_, _) | DataType::Date) } #[derive(Clone, Debug)] struct AddressTermGroup { alternatives: Vec, } #[derive(Debug)] struct AddressQuery { full_postcode: Option, text_groups: Vec, numeric_terms: Vec, candidate_terms: Vec, } fn tokenize_address_text(text: &str) -> Vec { let mut tokens = Vec::new(); let mut current = String::new(); for ch in text.chars() { if ch.is_ascii_alphanumeric() { current.push(ch.to_ascii_lowercase()); } else if matches!(ch, '\'' | '’' | '`') { continue; } else if !current.is_empty() { tokens.push(std::mem::take(&mut current)); } } if !current.is_empty() { tokens.push(current); } tokens } fn is_full_postcode_compact(compact: &str) -> bool { let bytes = compact.as_bytes(); let len = bytes.len(); if !(5..=7).contains(&len) { return false; } let inward = &bytes[len - 3..]; if !inward[0].is_ascii_digit() || !inward[1].is_ascii_alphabetic() || !inward[2].is_ascii_alphabetic() { return false; } let outward = &bytes[..len - 3]; if !(2..=4).contains(&outward.len()) { return false; } outward[0].is_ascii_alphabetic() && outward.iter().all(u8::is_ascii_alphanumeric) && outward.iter().any(u8::is_ascii_digit) } fn canonical_postcode_from_compact(compact: &str) -> String { let upper = compact.to_ascii_uppercase(); let split = upper.len() - 3; format!("{} {}", &upper[..split], &upper[split..]) } fn extract_full_postcode(tokens: &[String]) -> Option<(String, Vec)> { for (idx, token) in tokens.iter().enumerate() { let compact = token.to_ascii_uppercase(); if is_full_postcode_compact(&compact) { return Some((canonical_postcode_from_compact(&compact), vec![idx])); } } for idx in 0..tokens.len().saturating_sub(1) { let compact = format!( "{}{}", tokens[idx].to_ascii_uppercase(), tokens[idx + 1].to_ascii_uppercase() ); if is_full_postcode_compact(&compact) { return Some(( canonical_postcode_from_compact(&compact), vec![idx, idx + 1], )); } } None } fn looks_like_postcode_fragment(token: &str) -> bool { (2..=4).contains(&token.len()) && token .chars() .next() .is_some_and(|ch| ch.is_ascii_alphabetic()) && token.chars().any(|ch| ch.is_ascii_digit()) && token.chars().all(|ch| ch.is_ascii_alphanumeric()) } fn is_numeric_address_token(token: &str) -> bool { token.chars().all(|ch| ch.is_ascii_digit()) } fn address_token_aliases(token: &str) -> Vec<&'static str> { match token { "apt" => vec!["apt", "apartment"], "apartment" => vec!["apartment", "apt"], "ave" => vec!["ave", "avenue"], "avenue" => vec!["avenue", "ave"], "blvd" => vec!["blvd", "boulevard"], "boulevard" => vec!["boulevard", "blvd"], "cl" => vec!["cl", "close"], "close" => vec!["close", "cl"], "ct" => vec!["ct", "court"], "court" => vec!["court", "ct"], "cres" => vec!["cres", "crescent"], "crescent" => vec!["crescent", "cres"], "dr" => vec!["dr", "drive"], "drive" => vec!["drive", "dr"], "fl" => vec!["fl", "flat"], "flat" => vec!["flat", "fl"], "gdns" => vec!["gdns", "gardens", "garden"], "garden" => vec!["garden", "gardens", "gdns"], "gardens" => vec!["gardens", "garden", "gdns"], "hse" => vec!["hse", "house"], "house" => vec!["house", "hse"], "ln" => vec!["ln", "lane"], "lane" => vec!["lane", "ln"], "rd" => vec!["rd", "road"], "road" => vec!["road", "rd"], "sq" => vec!["sq", "square"], "square" => vec!["square", "sq"], "st" => vec!["st", "street", "saint"], "street" => vec!["street", "st"], "saint" => vec!["saint", "st"], "terr" => vec!["terr", "terrace"], "terrace" => vec!["terrace", "terr"], _ => Vec::new(), } } fn is_address_stop_token(token: &str) -> bool { matches!( token, "a" | "an" | "and" | "apartment" | "apt" | "avenue" | "ave" | "block" | "building" | "bungalow" | "close" | "cl" | "court" | "ct" | "cres" | "crescent" | "drive" | "dr" | "estate" | "flat" | "fl" | "floor" | "garden" | "gardens" | "gdns" | "grove" | "house" | "hse" | "lane" | "ln" | "lodge" | "mansions" | "mews" | "of" | "park" | "place" | "road" | "rd" | "room" | "row" | "saint" | "sq" | "square" | "st" | "street" | "terr" | "terrace" | "the" | "unit" | "view" | "villas" | "walk" | "way" | "yard" ) } fn address_term_group(token: &str) -> Option { if token.len() < 3 || is_numeric_address_token(token) || looks_like_postcode_fragment(token) { return None; } let mut alternatives = Vec::new(); alternatives.push(token.to_string()); for alias in address_token_aliases(token) { if !alternatives.iter().any(|existing| existing == alias) { alternatives.push(alias.to_string()); } } if alternatives .iter() .all(|alternative| is_address_stop_token(alternative)) { return None; } Some(AddressTermGroup { alternatives }) } fn address_search_tokens(text: &str) -> Vec { let mut tokens: Vec = tokenize_address_text(text) .into_iter() .filter(|token| is_address_search_token(token)) .collect(); tokens.sort_unstable(); tokens.dedup(); tokens } fn is_address_search_token(token: &str) -> bool { if looks_like_postcode_fragment(token) { return false; } if is_numeric_address_token(token) { return true; } if token.chars().any(|ch| ch.is_ascii_digit()) { return token.len() >= 2; } token.len() >= 3 } fn is_address_candidate_token(token: &str) -> bool { !is_numeric_address_token(token) && !looks_like_postcode_fragment(token) && (token.chars().any(|ch| ch.is_ascii_digit()) || (token.len() >= 3 && !is_address_stop_token(token))) } fn address_prefix_key(term: &str) -> &str { if term.len() > ADDRESS_SEARCH_PREFIX_MAX_LEN { &term[..ADDRESS_SEARCH_PREFIX_MAX_LEN] } else { term } } fn build_address_prefix_index( address_token_index: &FxHashMap>, ) -> FxHashMap> { let mut prefix_index: FxHashMap> = FxHashMap::default(); for token in address_token_index.keys() { let max_prefix_len = token.len().min(ADDRESS_SEARCH_PREFIX_MAX_LEN); for prefix_len in ADDRESS_SEARCH_PREFIX_MIN_LEN..=max_prefix_len { prefix_index .entry(token[..prefix_len].to_string()) .or_default() .push(token.clone()); } } for tokens in prefix_index.values_mut() { tokens.sort_unstable(); tokens.dedup(); } prefix_index } fn parse_address_query(query: &str) -> AddressQuery { let tokens = tokenize_address_text(query); let (full_postcode, postcode_token_indices) = extract_full_postcode(&tokens) .map(|(postcode, indices)| (Some(postcode), indices)) .unwrap_or((None, Vec::new())); let skip_postcode_tokens: FxHashSet = postcode_token_indices.into_iter().collect(); let mut text_groups = Vec::new(); let mut numeric_terms = Vec::new(); let mut candidate_terms = Vec::new(); for (idx, token) in tokens.iter().enumerate() { if skip_postcode_tokens.contains(&idx) || looks_like_postcode_fragment(token) { continue; } if is_numeric_address_token(token) { numeric_terms.push(token.clone()); continue; } if let Some(group) = address_term_group(token) { for alternative in &group.alternatives { if !is_address_stop_token(alternative) && !candidate_terms.iter().any(|term| term == alternative) { candidate_terms.push(alternative.clone()); } } text_groups.push(group); } else if token.chars().any(|ch| ch.is_ascii_digit()) && token.len() >= 2 { numeric_terms.push(token.clone()); if !candidate_terms.iter().any(|term| term == token) { candidate_terms.push(token.clone()); } } } text_groups.dedup_by(|left, right| left.alternatives == right.alternatives); numeric_terms.sort_unstable(); numeric_terms.dedup(); AddressQuery { full_postcode, text_groups, numeric_terms, candidate_terms, } } fn token_matches_query_term(token: &str, query_term: &str) -> bool { token == query_term || (query_term.len() >= 3 && token.starts_with(query_term)) } fn token_matches_numeric_term(token: &str, query_term: &str) -> bool { token == query_term || token.starts_with(query_term) } #[cfg(test)] fn address_tokens_match_group(tokens: &[String], group: &AddressTermGroup) -> bool { group.alternatives.iter().any(|alternative| { tokens .iter() .any(|token| token_matches_query_term(token, alternative)) }) } /// Histogram with outlier buckets at the edges. /// - Bin 0: [min, p1) — low outliers /// - Bins 1 to n-2: [p1, p99) — main distribution, evenly divided /// - Bin n-1: [p99, max] — high outliers #[derive(Serialize, Clone)] pub struct Histogram { pub min: f32, pub max: f32, /// 1st percentile (left edge of main distribution) pub p1: f32, /// 99th percentile (right edge of main distribution) pub p99: f32, pub counts: Vec, } impl Histogram { /// Return the bin index for a given value using the outlier-bracket layout. #[cfg(test)] pub fn bin_for_value(&self, value: f32) -> usize { let num_bins = self.counts.len(); if value < self.p1 { 0 } else if value >= self.p99 { num_bins - 1 } else { let middle_bins = num_bins.saturating_sub(2); if middle_bins > 0 && self.p99 > self.p1 { let width = (self.p99 - self.p1) / middle_bins as f32; let middle_bin = ((value - self.p1) / width) as usize; (1 + middle_bin).min(num_bins - 2) } else { num_bins / 2 } } } /// Width of a single middle bin (bins 1..n-2). #[cfg(test)] pub fn middle_bin_width(&self) -> f32 { let middle_bins = self.counts.len().saturating_sub(2); if middle_bins > 0 && self.p99 > self.p1 { (self.p99 - self.p1) / middle_bins as f32 } else { 0.0 } } } pub struct FeatureStats { pub slider_min: f32, pub slider_max: f32, pub histogram: Histogram, } #[derive(Serialize, Clone)] pub struct RenovationEvent { pub year: i32, pub event: String, } /// Lightweight reference to quantization parameters for decoding u16 feature data. pub struct QuantRef<'a> { pub dequant_a: &'a [f32], pub quant_min: &'a [f32], pub quant_range: &'a [f32], pub num_numeric: usize, } impl QuantRef<'_> { /// Decode a raw u16 value back to f32. #[inline] pub fn decode(&self, feat_idx: usize, raw: u16) -> f32 { if raw == NAN_U16 { return f32::NAN; } if feat_idx >= self.num_numeric { raw as f32 } else { raw as f32 * self.dequant_a[feat_idx] + self.quant_min[feat_idx] } } /// Encode a filter minimum bound to u16 (floors to include boundary values). #[inline] pub fn encode_min(&self, feat_idx: usize, value: f32) -> u16 { if !value.is_finite() || self.quant_range[feat_idx] == 0.0 { return 0; } let norm = (value - self.quant_min[feat_idx]) / self.quant_range[feat_idx]; (norm * QUANT_SCALE).floor().clamp(0.0, QUANT_SCALE) as u16 } /// Encode a filter maximum bound to u16 (ceils to include boundary values). #[inline] pub fn encode_max(&self, feat_idx: usize, value: f32) -> u16 { if !value.is_finite() || self.quant_range[feat_idx] == 0.0 { return QUANT_SCALE as u16; } let norm = (value - self.quant_min[feat_idx]) / self.quant_range[feat_idx]; (norm * QUANT_SCALE).ceil().clamp(0.0, QUANT_SCALE) as u16 } } pub struct PropertyData { pub lat: Vec, pub lon: Vec, pub feature_names: Vec, pub num_features: usize, /// Number of numeric features (enum features start at this index). pub num_numeric: usize, /// Row-major flat array: feature_data[row * num_features + feat_idx]. /// Quantized to u16. NaN sentinel = u16::MAX (65535). /// Numeric features: encoded via (val - min) / range * 65534. /// Enum features: stored directly as u16 cast of the f32 index. pub feature_data: Vec, /// Per-feature: range / QUANT_SCALE for fast decode. dequant_a: Vec, /// Per-feature: minimum value (offset for dequantization). quant_min: Vec, /// Per-feature: max - min (for encoding filter bounds). quant_range: Vec, pub feature_stats: Vec, /// Unquantized last sale price used by the price-history chart. last_known_price_raw: Vec, /// Contiguous buffer holding all address strings end-to-end. address_buffer: String, /// Byte offset into `address_buffer` where each row's address starts. address_offsets: Vec, /// Length in bytes of each row's address. address_lengths: Vec, /// Interned postcodes: reader is thread-safe, keys index into it. postcode_interner: lasso::RodeoReader, postcode_keys: Vec, /// Rows for each postcode, keyed by the interned postcode key. postcode_row_index: FxHashMap>, /// Inverted index from address tokens to property rows. address_token_index: FxHashMap>, /// Prefix lookup from typed address-token prefix to indexed full address tokens. address_prefix_index: FxHashMap>, /// Interned normalized address-search tokens used for per-row scoring. address_search_interner: lasso::RodeoReader, /// Flat per-row normalized address-search token keys. address_search_token_keys: Vec, /// Offset into `address_search_token_keys` for each row. address_search_token_offsets: Vec, /// Number of normalized address-search token keys for each row. address_search_token_lengths: Vec, /// For enum features: maps feature index to list of possible string values. /// Index in values list corresponds to the u16 value stored in feature_data. pub enum_values: rustc_hash::FxHashMap>, /// For enum features: maps feature index to per-value global counts (same order as enum_values). pub enum_counts: rustc_hash::FxHashMap>, /// Per-row flag: true = construction date is approximate (from EPC band), /// false = exact (from new-build transaction date). /// Bit-packed: byte `row / 8`, bit `row % 8`. 8x smaller than Vec. approx_build_date_bits: Vec, /// Per-row renovation events. Keyed by (permuted) row index. /// Only rows with events are present in the map. renovation_history: FxHashMap>, property_sub_type: FxHashMap, price_qualifier: FxHashMap, } impl PropertyData { /// Get the address string for a given row. pub fn address(&self, row: usize) -> &str { let offset = self.address_offsets[row] as usize; let length = self.address_lengths[row] as usize; &self.address_buffer[offset..offset + length] } /// Get the postcode string for a given row. pub fn postcode(&self, row: usize) -> &str { self.postcode_interner.resolve(&self.postcode_keys[row]) } /// Get postcode components for field-level borrowing (avoids conflicting borrows with feature_data). pub fn postcode_parts(&self) -> (&lasso::RodeoReader, &[lasso::Spur]) { (&self.postcode_interner, &self.postcode_keys) } fn row_address_search_tokens(&self, row: usize) -> &[lasso::Spur] { let offset = self.address_search_token_offsets[row] as usize; let length = self.address_search_token_lengths[row] as usize; &self.address_search_token_keys[offset..offset + length] } /// Search individual property addresses. Full postcode queries use a direct row index; /// free-text queries use a small inverted index over distinctive address tokens. pub fn search_addresses(&self, query: &str, limit: usize) -> Vec { if limit == 0 { return Vec::new(); } let parsed = parse_address_query(query); if parsed.full_postcode.is_none() && parsed.text_groups.is_empty() && parsed.numeric_terms.is_empty() { return Vec::new(); } let candidate_rows: Vec = if let Some(postcode) = parsed.full_postcode.as_deref() { self.postcode_interner .get(postcode) .and_then(|key| self.postcode_row_index.get(&key)) .map(|rows| rows.to_vec()) .unwrap_or_default() } else if let Some(rows) = self.best_address_token_rows(&parsed.candidate_terms) { rows.iter() .take(ADDRESS_SEARCH_CANDIDATE_LIMIT) .copied() .collect() } else { Vec::new() }; if candidate_rows.is_empty() { return Vec::new(); } let mut scored: Vec<(i32, usize, usize)> = candidate_rows .into_iter() .filter_map(|row| { let row = row as usize; self.address_match_score(row, &parsed) .map(|score| (score, self.address(row).len(), row)) }) .collect(); scored.sort_unstable_by(|left, right| { right .0 .cmp(&left.0) .then(left.1.cmp(&right.1)) .then(left.2.cmp(&right.2)) }); let mut seen = FxHashSet::default(); let mut results = Vec::with_capacity(limit); for (_, _, row) in scored { let address = self.address(row).trim(); if address.is_empty() { continue; } let key = format!("{}\n{}", address.to_ascii_lowercase(), self.postcode(row)); if !seen.insert(key) { continue; } results.push(row); if results.len() == limit { break; } } results } fn best_address_token_rows(&self, terms: &[String]) -> Option<&[u32]> { let mut best: Option<&[u32]> = None; for term in terms { if let Some(rows) = self.address_token_index.get(term) { if best.is_none_or(|current| rows.len() < current.len()) { best = Some(rows.as_slice()); } continue; } if term.len() < 4 { continue; } if let Some(tokens) = self.address_prefix_index.get(address_prefix_key(term)) { for token in tokens { if !token.starts_with(term) { continue; } if let Some(rows) = self.address_token_index.get(token) { if best.is_none_or(|current| rows.len() < current.len()) { best = Some(rows.as_slice()); } } } } } best } fn address_match_score(&self, row: usize, parsed: &AddressQuery) -> Option { if self.address(row).trim().is_empty() { return None; } let tokens = self.row_address_search_tokens(row); if parsed .text_groups .iter() .any(|group| !self.address_tokens_match_group(tokens, group)) { return None; } let numeric_matches = parsed .numeric_terms .iter() .filter(|term| { tokens.iter().any(|token| { token_matches_numeric_term(self.address_search_interner.resolve(token), term) }) }) .count(); if !parsed.numeric_terms.is_empty() && numeric_matches == 0 { return None; } let mut score = 0; if parsed.full_postcode.is_some() { score += 1_000; } score += (parsed.text_groups.len() as i32) * 200; score += (numeric_matches as i32) * 90; if numeric_matches == parsed.numeric_terms.len() && numeric_matches > 0 { score += 50; } Some(score) } fn address_tokens_match_group(&self, tokens: &[lasso::Spur], group: &AddressTermGroup) -> bool { group.alternatives.iter().any(|alternative| { tokens.iter().any(|token| { token_matches_query_term(self.address_search_interner.resolve(token), alternative) }) }) } /// Get the is_approx_build_date flag for a given row (bit-packed). pub fn is_approx_build_date(&self, row: usize) -> bool { let byte = self.approx_build_date_bits[row / 8]; byte & (1 << (row % 8)) != 0 } /// Get renovation events for a given row (empty slice if none). pub fn renovation_history(&self, row: usize) -> &[RenovationEvent] { self.renovation_history .get(&(row as u32)) .map(|v| v.as_slice()) .unwrap_or(&[]) } /// Get property sub-type for a given row. pub fn property_sub_type(&self, row: usize) -> Option<&str> { self.property_sub_type .get(&(row as u32)) .map(String::as_str) } /// Get price qualifier for a given row. pub fn price_qualifier(&self, row: usize) -> Option<&str> { self.price_qualifier.get(&(row as u32)).map(String::as_str) } /// Get the unquantized last sale price for charting. #[inline] pub fn last_known_price_raw(&self, row: usize) -> f32 { self.last_known_price_raw[row] } /// Decode a single feature value from quantized u16 storage. #[inline] pub fn get_feature(&self, row: usize, feat_idx: usize) -> f32 { let raw = self.feature_data[row * self.num_features + feat_idx]; if raw == NAN_U16 { return f32::NAN; } if feat_idx >= self.num_numeric { raw as f32 } else { raw as f32 * self.dequant_a[feat_idx] + self.quant_min[feat_idx] } } /// Get a QuantRef for passing to aggregation/filter functions. pub fn quant_ref(&self) -> QuantRef<'_> { QuantRef { dequant_a: &self.dequant_a, quant_min: &self.quant_min, quant_range: &self.quant_range, num_numeric: self.num_numeric, } } } /// Compute a percentile from a uniformly-binned histogram. /// `prelim_counts` are uniform bins over [min, max]. fn percentile_from_uniform_histogram( count: usize, min: f32, max: f32, prelim_counts: &[u64], percentile: f32, ) -> f32 { if count == 0 || prelim_counts.is_empty() { return min; } let target = (count as f64 * percentile as f64 / 100.0).floor() as u64; let bin_width = (max - min) / prelim_counts.len() as f32; let mut cumulative = 0u64; for (i, &bin_count) in prelim_counts.iter().enumerate() { let prev_cumulative = cumulative; cumulative += bin_count; if cumulative > target { // Interpolate within this bin let bin_start = min + i as f32 * bin_width; let fraction = if bin_count > 0 { (target - prev_cumulative) as f32 / bin_count as f32 } else { 0.0 }; return bin_start + fraction * bin_width; } } max } /// Build a histogram and compute slider bounds based on the feature's Bounds config. pub fn compute_feature_stats(vals: &[f32], bounds: &Bounds, integer_bins: bool) -> FeatureStats { // Single pass: min, max, count (skipping NaN and infinity) let mut min = f32::INFINITY; let mut max = f32::NEG_INFINITY; let mut count = 0usize; for &value in vals { if value.is_finite() { if value < min { min = value; } if value > max { max = value; } count += 1; } } if count == 0 { let (slider_min, slider_max) = match bounds { Bounds::Fixed { min: fmin, max: fmax, } => (*fmin, *fmax), Bounds::Percentile { .. } => (0.0, 0.0), }; return FeatureStats { slider_min, slider_max, histogram: Histogram { min: 0.0, max: 0.0, p1: 0.0, p99: 0.0, counts: vec![0; HISTOGRAM_BINS], }, }; } // Build preliminary histogram with uniform bins to compute percentiles // Use full HISTOGRAM_BINS for percentile precision let range = if max == min { 1.0 } else { max - min }; let prelim_max = min + range * (1.0 + 1e-6); let prelim_bin_width = (prelim_max - min) / HISTOGRAM_BINS as f32; let mut prelim_counts = vec![0u64; HISTOGRAM_BINS]; for &value in vals { if value.is_finite() { let bin = ((value - min) / prelim_bin_width) as usize; prelim_counts[bin.min(HISTOGRAM_BINS - 1)] += 1; } } // Compute p1 and p99 from preliminary histogram let mut p1 = percentile_from_uniform_histogram(count, min, max, &prelim_counts, 1.0); let mut p99 = percentile_from_uniform_histogram(count, min, max, &prelim_counts, 99.0); // Iterative refinement for outlier-dominated distributions. // When extreme outliers (e.g. 317M sqm from web scraping) dominate the range, // the uniform histogram puts all real data in one bin, making percentile // estimation useless. Zoom into the estimated data region and recompute. let mut refined_counts = prelim_counts; let mut refined_count = count; let mut refined_min = min; let mut refined_max = max; for _ in 0..3 { let iqr = p99 - p1; if iqr <= 0.0 || (refined_max - refined_min) <= 5.0 * iqr { break; } let new_min = (p1 - iqr).max(min); let new_max = p99 + iqr; if new_max <= new_min { break; } let bin_width = (new_max - new_min) / HISTOGRAM_BINS as f32; let mut counts = vec![0u64; HISTOGRAM_BINS]; let mut cnt = 0usize; for &value in vals { if value.is_finite() && value >= new_min && value <= new_max { let bin = ((value - new_min) / bin_width) as usize; counts[bin.min(HISTOGRAM_BINS - 1)] += 1; cnt += 1; } } if cnt == 0 { break; } p1 = percentile_from_uniform_histogram(cnt, new_min, new_max, &counts, 1.0); p99 = percentile_from_uniform_histogram(cnt, new_min, new_max, &counts, 99.0); refined_counts = counts; refined_count = cnt; refined_min = new_min; refined_max = new_max; } // For integer-binned features, snap p1/p99 to integer boundaries // so each middle bin is exactly 1 unit wide. if integer_bins { p1 = p1.floor(); p99 = p99.ceil(); } // Determine number of histogram bins let num_bins = if integer_bins && p99 > p1 { // One middle bin per integer + 2 outlier bins (p99 - p1) as usize + 2 } else { // Count unique values within the p1–p99 range to cap histogram bins. // Using the full-range cardinality would over-allocate bins when outliers // inflate it (e.g. bedrooms: 1–137 unique values but only ~10 within p1–p99). let cardinality = { let mut unique_set = rustc_hash::FxHashSet::default(); for &val in vals { if val.is_finite() && val >= p1 && val <= p99 { unique_set.insert(val.to_bits()); } } unique_set.len() }; HISTOGRAM_BINS.min(cardinality).max(3) }; // Build final histogram with outlier bins at edges: // - Bin 0: [min, p1) — low outliers // - Bins 1 to n-2: [p1, p99) — main distribution, evenly divided // - Bin n-1: [p99, max] — high outliers let mut counts = vec![0u64; num_bins]; let middle_bins = num_bins.saturating_sub(2); let middle_width = if middle_bins > 0 && p99 > p1 { (p99 - p1) / middle_bins as f32 } else { 0.0 }; for &value in vals { if value.is_finite() { let bin = if value < p1 { 0 // Low outlier bin } else if value >= p99 { num_bins - 1 // High outlier bin } else if middle_width > 0.0 { // Middle bins (1 to n-2) let middle_bin = ((value - p1) / middle_width) as usize; (1 + middle_bin).min(num_bins - 2) } else { num_bins / 2 // Fallback if p1 == p99 }; counts[bin] += 1; } } let histogram = Histogram { min: refined_min, max: refined_max, p1, p99, counts, }; // Compute slider bounds (use refined histogram for accurate percentiles) let (slider_min, slider_max) = match bounds { Bounds::Fixed { min: fmin, max: fmax, } => (*fmin, *fmax), Bounds::Percentile { low, high } => { let p_low = percentile_from_uniform_histogram( refined_count, refined_min, refined_max, &refined_counts, *low as f32, ); let p_high = percentile_from_uniform_histogram( refined_count, refined_min, refined_max, &refined_counts, *high as f32, ); (p_low, p_high) } }; FeatureStats { slider_min, slider_max, histogram, } } fn column_to_f32_vec(column: &Column) -> anyhow::Result> { let float_series = column .cast(&DataType::Float32) .context("Failed to cast column to Float32")?; let chunked = float_series .f32() .context("Failed to get f32 chunked array")?; Ok(chunked .into_iter() .map(|value| value.unwrap_or(f32::NAN)) .collect()) } /// Precompute H3 cell IDs for all rows at the maximum resolution only. /// Parent cells for lower resolutions are derived on the fly via `CellIndex::parent()`. pub fn precompute_h3(lat: &[f32], lon: &[f32]) -> anyhow::Result> { let res = H3_PRECOMPUTE_MAX; tracing::info!("Precomputing H3 cells at resolution {}", res); let h3_res = h3o::Resolution::try_from(res).with_context(|| format!("Invalid H3 resolution: {res}"))?; let cells: Vec = lat .par_iter() .zip(lon.par_iter()) .enumerate() .map(|(i, (&latitude, &longitude))| { let coord = h3o::LatLng::new(latitude as f64, longitude as f64).unwrap_or_else(|err| { panic!( "Invalid coordinates at row {}: lat={}, lon={}: {}", i, latitude, longitude, err ) }); u64::from(coord.to_cell(h3_res)) }) .collect(); tracing::info!("H3 precomputation complete ({} cells)", cells.len()); Ok(cells) } impl PropertyData { pub fn load(properties_path: &Path, postcode_features_path: &Path) -> anyhow::Result { // Load postcode.parquet tracing::info!( "Loading postcode features from {:?}", postcode_features_path ); let postcode_df = LazyFrame::scan_parquet(postcode_features_path, Default::default()) .context("Failed to scan postcode parquet")? .collect() .context("Failed to read postcode parquet")?; tracing::info!(rows = postcode_df.height(), "Postcode features loaded"); // Load properties.parquet and join with postcode data for lat/lon + area features tracing::info!("Loading properties from {:?}", properties_path); let properties_lf = LazyFrame::scan_parquet(properties_path, Default::default()) .context("Failed to scan properties parquet")?; let combined = properties_lf .join( postcode_df.clone().lazy(), [col("Postcode")], [col("Postcode")], JoinArgs::new(JoinType::Left), ) .collect() .context("Failed to join properties with postcodes")?; let total_rows = combined.height(); tracing::info!(rows = total_rows, "Properties joined with postcodes"); // Get configured feature/enum names in config order let numeric_names = features::all_numeric_feature_names(); let enum_names = features::all_enum_feature_names(); let schema = combined.schema(); for name in &numeric_names { match schema.get(name) { Some(dtype) if is_numeric_dtype(dtype) => {} Some(dtype) => bail!( "Configured numeric feature '{}' has non-numeric type {:?}", name, dtype ), None => bail!( "Configured numeric feature '{}' not found in combined schema", name ), } } for name in &enum_names { match schema.get(name) { Some(dtype) if matches!(dtype, DataType::String) || dtype.is_categorical() => {} Some(dtype) => bail!( "Configured enum feature '{}' has unexpected type {:?}", name, dtype ), None => bail!( "Configured enum feature '{}' not found in combined schema", name ), } } // Combine numeric and enum feature names (numeric first, then enum) let feature_names: Vec = numeric_names .iter() .chain(enum_names.iter()) .map(|name| name.to_string()) .collect(); let num_features = feature_names.len(); let num_numeric = numeric_names.len(); tracing::info!( numeric = num_numeric, enums = enum_names.len(), total = num_features, "Feature columns from config" ); // Build select expressions for the combined DataFrame let mut select_exprs: Vec = vec![]; select_exprs.push(col("lat").cast(DataType::Float32)); select_exprs.push(col("lon").cast(DataType::Float32)); // Select numeric features as Float32 (datetime columns → fractional year) for &name in &numeric_names { if is_datetime_dtype(schema.get(name).unwrap()) { select_exprs.push( (col(name).dt().year().cast(DataType::Float32) + (col(name).dt().month().cast(DataType::Float32) - lit(1.0f32)) / lit(12.0f32)) .alias(name), ); } else { select_exprs.push(col(name).cast(DataType::Float32)); } } // String columns for address/postcode and property metadata for &string_col_name in &[ "Address per Property Register", "Address per EPC", "Postcode", "Property sub-type", "Price qualifier", ] { if schema.get(string_col_name).is_some() { select_exprs.push(col(string_col_name).cast(DataType::String)); } } // Enum features as String for &name in &enum_names { select_exprs.push(col(name).cast(DataType::String)); } // Optional columns let has_approx_col = schema.get("Is construction date approximate").is_some(); if has_approx_col { select_exprs.push(col("Is construction date approximate").cast(DataType::Float32)); } let has_renovation_history = schema.get("renovation_history").is_some(); if has_renovation_history { select_exprs.push(col("renovation_history")); } let df = combined .lazy() .filter(col("lat").is_not_null().and(col("lon").is_not_null())) .select(select_exprs) .collect() .context("Failed to select columns from combined data")?; let row_count = df.height(); if row_count == 0 { bail!("No property rows have usable coordinates after joining postcode data"); } let dropped_coordinate_rows = total_rows.saturating_sub(row_count); if dropped_coordinate_rows > 0 { tracing::warn!( rows = dropped_coordinate_rows, "Dropped properties with missing postcode coordinates" ); } tracing::info!(rows = row_count, "Combined data selected"); let lat_series = df .column("lat") .context("Missing 'lat' column")? .cast(&DataType::Float32) .context("Failed to cast 'lat' to Float32")?; let lat: Vec = lat_series .f32() .context("Failed to read 'lat' as f32")? .into_iter() .map(|value| value.context("Missing 'lat' value after coordinate filter")) .collect::>>()?; let lon_series = df .column("lon") .context("Missing 'lon' column")? .cast(&DataType::Float32) .context("Failed to cast 'lon' to Float32")?; let lon: Vec = lon_series .f32() .context("Failed to read 'lon' as f32")? .into_iter() .map(|value| value.context("Missing 'lon' value after coordinate filter")) .collect::>>()?; for (row, (&latitude, &longitude)) in lat.iter().zip(&lon).enumerate() { if !(-90.0..=90.0).contains(&latitude) || !(-180.0..=180.0).contains(&longitude) { bail!("Invalid coordinates at row {row}: lat={latitude}, lon={longitude}"); } } tracing::info!("Extracting numeric feature columns"); let numeric_col_major: Vec> = numeric_names .par_iter() .map(|name| { let column = df .column(name) .with_context(|| format!("Missing feature column '{name}'"))?; column_to_f32_vec(column) }) .collect::>>()?; tracing::info!("Computing histograms for numeric features"); let numeric_feature_stats: Vec = numeric_col_major .par_iter() .enumerate() .map(|(feat_index, vals)| { let name = numeric_names[feat_index]; let bounds = features::bounds_for(name) .with_context(|| format!("No bounds config for feature '{}'", name))?; let stats = compute_feature_stats(vals, bounds, features::has_integer_bins(name)); tracing::debug!( feature = %name, slider_min = format_args!("{:.2}", stats.slider_min), slider_max = format_args!("{:.2}", stats.slider_max), bins = stats.histogram.counts.len(), "Feature stats" ); Ok(stats) }) .collect::>>()?; // Compute quantization parameters from feature stats (numeric features). // For features with Fixed bounds, use those bounds so the full configured range // is representable — the histogram refinement can narrow min/max to exclude // "outliers" that are actually valid data (e.g. ethnicity percentages). // For Percentile-bounded features, use the (possibly refined) histogram range // so extreme outliers don't destroy precision for the main distribution. let mut quant_min = Vec::with_capacity(num_features); let mut quant_range = Vec::with_capacity(num_features); for (feat_idx, stats) in numeric_feature_stats.iter().enumerate() { let (min, max) = match features::bounds_for(numeric_names[feat_idx]) { Some(Bounds::Fixed { min, max }) => (*min, *max), _ => (stats.histogram.min, stats.histogram.max), }; quant_min.push(min); quant_range.push(if max > min { max - min } else { 0.0 }); } tracing::info!("Extracting string columns"); let extract_string_col = |df: &DataFrame, name: &str| -> anyhow::Result> { let column = df .column(name) .with_context(|| format!("Required column '{name}' not found in parquet"))?; let string_column = column .str() .with_context(|| format!("Column '{name}' is not a string column"))?; Ok(string_column .into_iter() .map(|value| value.unwrap_or("").to_string()) .collect()) }; let address_raw = extract_string_col(&df, "Address per Property Register")?; let postcode_raw = extract_string_col(&df, "Postcode")?; // Extract optional string columns let extract_optional_string_col = |df: &DataFrame, name: &str| -> anyhow::Result>> { if let Ok(column) = df.column(name) { let string_column = column .str() .with_context(|| format!("Column '{name}' is not a string column"))?; Ok(string_column .into_iter() .map(|value| { value.and_then(|s| { let trimmed = s.trim(); if trimmed.is_empty() { None } else { Some(trimmed.to_string()) } }) }) .collect()) } else { Ok(vec![None; row_count]) } }; let property_sub_type_raw = extract_optional_string_col(&df, "Property sub-type")?; let price_qualifier_raw = extract_optional_string_col(&df, "Price qualifier")?; tracing::info!("Building enum features"); // enum_col_major: Vec<(values_list, encoded_as_f32)> let enum_col_major: Vec<(Vec, Vec)> = enum_names .par_iter() .filter_map(|&name| { let column_data = df.column(name).ok()?; let string_column = column_data.str().ok()?; let unique_set: std::collections::HashSet = string_column .into_iter() .filter_map(|value| { let text = value.unwrap_or(""); if text.is_empty() { None } else { Some(text.to_string()) } }) .collect(); // Use configured order if available, otherwise alphabetical let unique: Vec = if let Some(order) = features::order_for(name) { let mut ordered: Vec = Vec::new(); for &ordered_value in order { if unique_set.contains(ordered_value) { ordered.push(ordered_value.to_string()); } } // Append any values not in the configured order, alphabetically // Use HashSet for O(1) contains instead of O(n) slice search let order_set: rustc_hash::FxHashSet<&str> = order.iter().copied().collect(); let mut remainder: Vec = unique_set .iter() .filter(|value| !order_set.contains(value.as_str())) .cloned() .collect(); remainder.sort(); ordered.extend(remainder); ordered } else { let mut sorted: Vec = unique_set.into_iter().collect(); sorted.sort(); sorted }; let value_to_idx: std::collections::HashMap<&str, f32> = unique .iter() .enumerate() .map(|(index, value)| (value.as_str(), index as f32)) .collect(); let encoded: Vec = string_column .into_iter() .map(|value| { let text = value.unwrap_or(""); if text.is_empty() { f32::NAN } else { *value_to_idx.get(text).unwrap_or(&f32::NAN) } }) .collect(); tracing::debug!(column = %name, unique_values = unique.len(), "Enum feature encoded as f32"); Some((unique, encoded)) }) .collect(); // Extract is_approx_build_date: 0.0 = exact, anything else (1.0/NaN) = approximate let is_approx_build_date_raw: Vec = if has_approx_col { let column_data = df .column("Is construction date approximate") .context("Missing 'Is construction date approximate' column")?; let float_series = column_data .cast(&DataType::Float32) .context("Failed to cast 'Is construction date approximate' to Float32")?; let chunked = float_series .f32() .context("Failed to read 'Is construction date approximate' as f32")?; chunked .into_iter() .map(|value| match value { Some(0.0) => false, _ => true, // 1.0 or NaN → approximate }) .collect() } else { vec![true; row_count] // default: all approximate }; // Extract renovation_history: List let mut renovation_raw: FxHashMap> = if has_renovation_history { tracing::info!("Extracting renovation history"); let reno_col = df .column("renovation_history") .context("Missing renovation_history column")?; let list_ca = reno_col .list() .context("renovation_history is not a list column")?; let mut history: FxHashMap> = FxHashMap::default(); for old_row in 0..row_count { if let Some(inner) = list_ca.get_as_series(old_row) { if inner.is_empty() { continue; } let structs = inner .struct_() .context("renovation_history inner is not a struct")?; let years = structs .field_by_name("year") .context("Missing 'year' field in renovation_history struct")?; let events = structs .field_by_name("event") .context("Missing 'event' field in renovation_history struct")?; let mut row_events = Vec::new(); for idx in 0..inner.len() { let year = years.get(idx).context("Failed to get year value")?; let event = events.get(idx).context("Failed to get event value")?; if let (AnyValue::Int32(yr), AnyValue::String(ev)) = (&year, &event) { row_events.push(RenovationEvent { year: *yr, event: ev.to_string(), }); } } if !row_events.is_empty() { history.insert(old_row as u32, row_events); } } } tracing::info!( properties_with_events = history.len(), "Renovation history extracted" ); history } else { FxHashMap::default() }; // Sort all rows by spatial locality so that grid queries access // contiguous memory (sequential reads instead of random DRAM accesses). tracing::info!("Sorting rows by spatial locality"); let grid_cell_size = 0.01_f32; let min_lat_val = lat.iter().cloned().fold(f32::INFINITY, f32::min) - grid_cell_size; let min_lon_val = lon.iter().cloned().fold(f32::INFINITY, f32::min) - grid_cell_size; let max_lon_val = lon.iter().cloned().fold(f32::NEG_INFINITY, f32::max) + grid_cell_size; let grid_cols = ((max_lon_val - min_lon_val) / grid_cell_size).ceil() as u64 + 1; let mut perm: Vec = (0..row_count as u32).collect(); perm.par_sort_unstable_by_key(|&perm_index| { let grid_row = ((lat[perm_index as usize] - min_lat_val) / grid_cell_size) as u64; let grid_col = ((lon[perm_index as usize] - min_lon_val) / grid_cell_size) as u64; grid_row * grid_cols + grid_col }); let lat: Vec = perm .iter() .map(|&perm_index| lat[perm_index as usize]) .collect(); let lon: Vec = perm .iter() .map(|&perm_index| lon[perm_index as usize]) .collect(); let last_known_price_raw: Vec = numeric_names .iter() .position(|&name| name == "Last known price") .map(|price_idx| { perm.iter() .map(|&perm_index| numeric_col_major[price_idx][perm_index as usize]) .collect() }) .unwrap_or_else(|| vec![f32::NAN; row_count]); // Build contiguous address buffer and address search index (permuted) tracing::info!("Building interned strings"); let total_addr_bytes: usize = address_raw.iter().map(|text| text.len()).sum(); let mut address_buffer = String::with_capacity(total_addr_bytes); let mut address_offsets = Vec::with_capacity(row_count); let mut address_lengths = Vec::with_capacity(row_count); let mut address_token_index: FxHashMap> = FxHashMap::default(); let mut address_search_rodeo = lasso::Rodeo::default(); let mut address_search_token_keys: Vec = Vec::new(); let mut address_search_token_offsets = Vec::with_capacity(row_count); let mut address_search_token_lengths = Vec::with_capacity(row_count); for (new_row, &perm_index) in perm.iter().enumerate() { let addr = &address_raw[perm_index as usize]; let offset = address_buffer.len() as u32; let length = addr.len().min(u16::MAX as usize) as u16; address_offsets.push(offset); address_lengths.push(length); address_buffer.push_str(&addr[..length as usize]); let search_tokens = address_search_tokens(addr); let token_offset = address_search_token_keys.len() as u32; let token_length = search_tokens.len().min(u16::MAX as usize) as u16; address_search_token_offsets.push(token_offset); address_search_token_lengths.push(token_length); for token in search_tokens.iter().take(token_length as usize) { let key = address_search_rodeo.get_or_intern(token); address_search_token_keys.push(key); if is_address_candidate_token(token) { address_token_index .entry(token.clone()) .or_default() .push(new_row as u32); } } } let address_token_count_before_prune = address_token_index.len(); address_token_index.retain(|_, rows| rows.len() <= ADDRESS_SEARCH_MAX_POSTINGS_PER_TOKEN); let address_prefix_index = build_address_prefix_index(&address_token_index); let address_search_interner = address_search_rodeo.into_reader(); let address_postings_count: usize = address_token_index.values().map(Vec::len).sum(); tracing::info!( tokens = address_token_index.len(), prefixes = address_prefix_index.len(), pruned_tokens = address_token_count_before_prune.saturating_sub(address_token_index.len()), postings = address_postings_count, row_tokens = address_search_token_keys.len(), "Address search index built" ); // Intern postcodes (permuted) let mut postcode_rodeo = lasso::Rodeo::default(); let mut postcode_keys: Vec = Vec::with_capacity(row_count); let mut postcode_row_index: FxHashMap> = FxHashMap::default(); for (new_row, &perm_index) in perm.iter().enumerate() { let key = postcode_rodeo.get_or_intern(&postcode_raw[perm_index as usize]); postcode_keys.push(key); postcode_row_index .entry(key) .or_default() .push(new_row as u32); } let postcode_interner = postcode_rodeo.into_reader(); // Pack is_approx_build_date into a bitvec (8 bools per byte) let num_bytes = row_count.div_ceil(8); let mut approx_build_date_bits = vec![0u8; num_bytes]; for (new_row, &old_row) in perm.iter().enumerate() { if is_approx_build_date_raw[old_row as usize] { approx_build_date_bits[new_row / 8] |= 1 << (new_row % 8); } } // Re-key renovation_history by permuted row index let renovation_history: FxHashMap> = { let mut map = FxHashMap::with_capacity_and_hasher(renovation_raw.len(), Default::default()); for (new_row, &old_row) in perm.iter().enumerate() { if let Some(events) = renovation_raw.remove(&old_row) { map.insert(new_row as u32, events); } } map }; // Permute optional string columns into sparse HashMaps let property_sub_type: FxHashMap = { let mut map = FxHashMap::default(); for (new_row, &old_row) in perm.iter().enumerate() { if let Some(ref s) = property_sub_type_raw[old_row as usize] { map.insert(new_row as u32, s.clone()); } } map }; let price_qualifier: FxHashMap = { let mut map = FxHashMap::default(); for (new_row, &old_row) in perm.iter().enumerate() { if let Some(ref s) = price_qualifier_raw[old_row as usize] { map.insert(new_row as u32, s.clone()); } } map }; // Build enum_values map: feature_index -> list of string values // and enum_counts map: feature_index -> per-value global counts let mut enum_values: rustc_hash::FxHashMap> = rustc_hash::FxHashMap::default(); let mut enum_counts: rustc_hash::FxHashMap> = rustc_hash::FxHashMap::default(); for (enum_idx, (values, encoded)) in enum_col_major.iter().enumerate() { let feature_idx = num_numeric + enum_idx; enum_values.insert(feature_idx, values.clone()); let mut counts = vec![0u64; values.len()]; for &val in encoded { if val.is_finite() { let idx = val as usize; if idx < counts.len() { counts[idx] += 1; } } } enum_counts.insert(feature_idx, counts); } // Build feature_stats: numeric stats + placeholder stats for enums let mut feature_stats = numeric_feature_stats; for (values, _) in &enum_col_major { // For enum features, slider range is 0 to num_values-1 let num_values = values.len(); let max_val = num_values as f32; feature_stats.push(FeatureStats { slider_min: 0.0, slider_max: (num_values.saturating_sub(1)) as f32, histogram: Histogram { min: 0.0, max: max_val, p1: 0.0, p99: max_val, counts: vec![0; num_values.max(1)], }, }); // Enum features: not quantized, stored directly as u16 quant_min.push(0.0); quant_range.push(0.0); } let dequant_a: Vec = quant_range .iter() .map(|&r| if r > 0.0 { r / QUANT_SCALE } else { 0.0 }) .collect(); // Transpose to row-major AND apply spatial permutation in one pass. // Combines numeric and enum features into a single feature_data array, quantized to u16. tracing::info!("Transposing to row-major layout (spatially sorted, quantized to u16)"); let mut feature_data = vec![NAN_U16; row_count * num_features]; feature_data .par_chunks_mut(num_features) .enumerate() .for_each(|(new_row, row_slice)| { let old_index = perm[new_row] as usize; // Numeric features: quantize to u16 for (feat_idx, col_vec) in numeric_col_major.iter().enumerate() { let value = col_vec[old_index]; row_slice[feat_idx] = if value.is_finite() { let range = quant_range[feat_idx]; if range > 0.0 { let normalized = (value - quant_min[feat_idx]) / range; (normalized * QUANT_SCALE).round().clamp(0.0, QUANT_SCALE) as u16 } else { 0 } } else { NAN_U16 }; } // Enum features: store as u16 directly for (enum_idx, (_, encoded)) in enum_col_major.iter().enumerate() { let value = encoded[old_index]; row_slice[num_numeric + enum_idx] = if value.is_finite() { value as u16 } else { NAN_U16 }; } }); tracing::info!("Data loading complete"); Ok(PropertyData { lat, lon, feature_names, num_features, num_numeric, feature_data, dequant_a, quant_min, quant_range, feature_stats, last_known_price_raw, address_buffer, address_offsets, address_lengths, postcode_interner, postcode_keys, postcode_row_index, address_token_index, address_prefix_index, address_search_interner, address_search_token_keys, address_search_token_offsets, address_search_token_lengths, enum_values, enum_counts, approx_build_date_bits, renovation_history, property_sub_type, price_qualifier, }) } } #[cfg(test)] mod tests { use super::*; use crate::features::Bounds; fn make_fixed_bounds(min: f32, max: f32) -> Bounds { Bounds::Fixed { min, max } } fn make_percentile_bounds(low: f64, high: f64) -> Bounds { Bounds::Percentile { low, high } } #[test] fn full_postcode_detection_accepts_common_formats() { assert!(is_full_postcode_compact("SW1A1AA")); assert!(is_full_postcode_compact("E142DG")); assert!(is_full_postcode_compact("M11AE")); assert!(!is_full_postcode_compact("E14")); assert!(!is_full_postcode_compact("DOWNING")); assert!(!is_full_postcode_compact("10A")); } #[test] fn address_query_parsing_skips_postcodes_and_street_suffixes() { let parsed = parse_address_query("Flat 2, 10 Downing St, SW1A 2AA"); assert_eq!(parsed.full_postcode.as_deref(), Some("SW1A 2AA")); assert_eq!( parsed.numeric_terms, vec!["10".to_string(), "2".to_string()] ); assert_eq!(parsed.candidate_terms, vec!["downing".to_string()]); assert_eq!(parsed.text_groups.len(), 1); assert_eq!( parsed.text_groups[0].alternatives, vec!["downing".to_string()] ); } #[test] fn address_query_parsing_handles_compact_postcodes() { let parsed = parse_address_query("10 downing street sw1a1aa"); assert_eq!(parsed.full_postcode.as_deref(), Some("SW1A 1AA")); assert_eq!(parsed.numeric_terms, vec!["10".to_string()]); assert_eq!(parsed.candidate_terms, vec!["downing".to_string()]); } #[test] fn address_query_parsing_keeps_partial_terms_for_row_matching() { let parsed = parse_address_query("settlers cour"); assert_eq!(parsed.full_postcode, None); assert_eq!(parsed.numeric_terms, Vec::::new()); assert_eq!( parsed.candidate_terms, vec!["settlers".to_string(), "cour".to_string()] ); assert_eq!(parsed.text_groups.len(), 2); assert_eq!( parsed.text_groups[0].alternatives, vec!["settlers".to_string()] ); assert_eq!(parsed.text_groups[1].alternatives, vec!["cour".to_string()]); } #[test] fn address_search_tokens_keep_actual_address_terms_for_scoring() { let tokens = address_search_tokens("Flat 2, 10 Downing Cour"); assert_eq!( tokens, vec![ "10".to_string(), "2".to_string(), "cour".to_string(), "downing".to_string(), "flat".to_string() ] ); } #[test] fn address_prefix_index_finds_partial_address_terms() { let mut token_index: FxHashMap> = FxHashMap::default(); token_index.insert("downing".to_string(), vec![1]); token_index.insert("downton".to_string(), vec![2]); token_index.insert("market".to_string(), vec![3]); let prefix_index = build_address_prefix_index(&token_index); assert_eq!( prefix_index.get("down").cloned().unwrap_or_default(), vec!["downing".to_string(), "downton".to_string()] ); assert_eq!( prefix_index.get("downi").cloned().unwrap_or_default(), vec!["downing".to_string()] ); assert_eq!( prefix_index.get("downt").cloned().unwrap_or_default(), vec!["downton".to_string()] ); assert!(!prefix_index.contains_key("do")); } #[test] fn address_term_matching_allows_prefixes_and_aliases() { let tokens = tokenize_address_text("10 Downing Street"); let prefix_group = address_term_group("down").expect("prefix term should be searchable"); let alias_group = AddressTermGroup { alternatives: vec!["st".to_string(), "street".to_string()], }; assert!(address_tokens_match_group(&tokens, &prefix_group)); assert!(address_tokens_match_group(&tokens, &alias_group)); } #[test] fn address_term_matching_uses_actual_token_prefixes() { let tokens = tokenize_address_text("12 Settlers Court"); let prefix_group = address_term_group("cou").expect("partial term should be searchable"); assert!(address_tokens_match_group(&tokens, &prefix_group)); } #[test] fn histogram_empty_data() { let data: Vec = vec![]; let bounds = make_fixed_bounds(0.0, 100.0); let stats = compute_feature_stats(&data, &bounds, false); assert_eq!(stats.slider_min, 0.0); assert_eq!(stats.slider_max, 100.0); assert_eq!(stats.histogram.counts.iter().sum::(), 0); } #[test] fn histogram_single_value() { let data = vec![50.0_f32]; let bounds = make_fixed_bounds(0.0, 100.0); let stats = compute_feature_stats(&data, &bounds, false); assert_eq!(stats.histogram.min, 50.0); assert_eq!(stats.histogram.max, 50.0); assert_eq!(stats.histogram.counts.iter().sum::(), 1); } #[test] fn histogram_uniform_distribution() { let data: Vec = (0..100).map(|i| i as f32).collect(); let bounds = make_fixed_bounds(0.0, 100.0); let stats = compute_feature_stats(&data, &bounds, false); assert_eq!(stats.histogram.min, 0.0); assert_eq!(stats.histogram.max, 99.0); assert_eq!(stats.histogram.counts.iter().sum::(), 100); } #[test] fn histogram_with_nan_values() { let data = vec![10.0_f32, f32::NAN, 20.0, f32::NAN, 30.0]; let bounds = make_fixed_bounds(0.0, 100.0); let stats = compute_feature_stats(&data, &bounds, false); assert_eq!(stats.histogram.counts.iter().sum::(), 3); assert_eq!(stats.histogram.min, 10.0); assert_eq!(stats.histogram.max, 30.0); } #[test] fn histogram_all_nan() { let data = vec![f32::NAN, f32::NAN, f32::NAN]; let bounds = make_fixed_bounds(0.0, 100.0); let stats = compute_feature_stats(&data, &bounds, false); assert_eq!(stats.histogram.counts.iter().sum::(), 0); } #[test] fn histogram_all_same_value() { let data = vec![42.0_f32; 1000]; let bounds = make_fixed_bounds(0.0, 100.0); let stats = compute_feature_stats(&data, &bounds, false); assert_eq!(stats.histogram.min, 42.0); assert_eq!(stats.histogram.max, 42.0); assert_eq!(stats.histogram.p1, 42.0); assert_eq!(stats.histogram.p99, 42.0); assert_eq!(stats.histogram.counts.iter().sum::(), 1000); } #[test] fn histogram_percentile_bounds() { let mut data: Vec = vec![0.0]; // Low outlier data.extend((1..99).map(|i| 50.0 + i as f32 * 0.01)); data.push(1000.0); // High outlier let bounds = make_percentile_bounds(2.0, 98.0); let stats = compute_feature_stats(&data, &bounds, false); assert!(stats.slider_min > 0.0); assert!(stats.slider_max < 1000.0); } #[test] fn fixed_price_bounds_keep_slider_cap() { let data = vec![400_000.0_f32, 2_500_000.0, 3_750_000.0]; let bounds = make_fixed_bounds(0.0, 2_500_000.0); let stats = compute_feature_stats(&data, &bounds, false); assert_eq!(stats.slider_min, 0.0); assert_eq!(stats.slider_max, 2_500_000.0); } #[test] fn histogram_bin_for_value() { let hist = Histogram { min: 0.0, max: 100.0, p1: 10.0, p99: 90.0, counts: vec![0; 10], }; assert_eq!(hist.bin_for_value(5.0), 0); // Low outlier bin assert_eq!(hist.bin_for_value(95.0), 9); // High outlier bin let mid_value = 50.0; let bin = hist.bin_for_value(mid_value); assert!((1..=8).contains(&bin)); } #[test] fn histogram_middle_bin_width() { let hist = Histogram { min: 0.0, max: 100.0, p1: 10.0, p99: 90.0, counts: vec![0; 10], }; let expected_width = (90.0 - 10.0) / 8.0; assert!((hist.middle_bin_width() - expected_width).abs() < 0.001); } #[test] fn histogram_cardinality_caps_bins() { let data = vec![1.0_f32, 1.0, 2.0, 2.0, 3.0, 3.0]; let bounds = make_fixed_bounds(0.0, 100.0); let stats = compute_feature_stats(&data, &bounds, false); assert_eq!(stats.histogram.counts.len(), 3); } #[test] fn min_max_skips_nan() { let values = vec![10.0_f32, f32::NAN, 20.0, f32::NAN, 5.0]; let mut min = f32::INFINITY; let mut max = f32::NEG_INFINITY; for &v in &values { if v.is_finite() { if v < min { min = v; } if v > max { max = v; } } } assert_eq!(min, 5.0); assert_eq!(max, 20.0); } #[test] fn count_skips_nan() { let values = [1.0_f32, f32::NAN, 2.0, f32::NAN, 3.0]; let count = values.iter().filter(|v| v.is_finite()).count(); assert_eq!(count, 3); } #[test] fn enum_value_counting() { let values = vec![0.0_f32, 1.0, 1.0, 2.0, f32::NAN, 3.0, 1.0]; let enum_count = 4; let mut counts = vec![0u64; enum_count]; for &v in &values { if v.is_finite() { let idx = v as usize; if idx < enum_count { counts[idx] += 1; } } } assert_eq!(counts[0], 1); assert_eq!(counts[1], 3); assert_eq!(counts[2], 1); assert_eq!(counts[3], 1); } #[test] fn infinity_values_excluded() { let data = vec![f32::INFINITY, f32::NEG_INFINITY, 50.0]; let bounds = Bounds::Fixed { min: 0.0, max: 100.0, }; let stats = compute_feature_stats(&data, &bounds, false); assert_eq!(stats.histogram.min, 50.0); assert_eq!(stats.histogram.max, 50.0); assert_eq!(stats.histogram.counts.iter().sum::(), 1); } #[test] fn only_finite_values() { let data = vec![10.0_f32, 20.0, 30.0]; let bounds = Bounds::Fixed { min: 0.0, max: 100.0, }; let stats = compute_feature_stats(&data, &bounds, false); assert_eq!(stats.histogram.min, 10.0); assert_eq!(stats.histogram.max, 30.0); assert_eq!(stats.histogram.counts.iter().sum::(), 3); } #[test] fn extreme_outlier_does_not_destroy_quantization() { // Simulate floor area: 10k normal values (50-200 sqm) + one 317M outlier let mut data: Vec = (0..10_000).map(|i| 50.0 + (i % 150) as f32).collect(); data.push(317_000_000.0); // Extreme outlier from web scraping let bounds = make_percentile_bounds(0.0, 98.0); let stats = compute_feature_stats(&data, &bounds, false); // After refinement, histogram range should be much tighter than 317M assert!( stats.histogram.max < 1_000_000.0, "histogram.max should be refined, got {}", stats.histogram.max, ); // p1 should be near 50, not millions assert!( stats.histogram.p1 < 100.0, "p1 should be near real data, got {}", stats.histogram.p1, ); // Slider min should reflect actual data range assert!( stats.slider_min < 100.0, "slider_min should be near real data, got {}", stats.slider_min, ); // Quantization using histogram.min/max should give usable range let qmin = stats.histogram.min; let qrange = stats.histogram.max - stats.histogram.min; assert!(qrange > 0.0 && qrange < 1_000_000.0); // A typical floor area (100 sqm) should be distinguishable from min let normalized = (100.0 - qmin) / qrange; let encoded = (normalized * QUANT_SCALE).round() as u16; assert!( encoded > 100, "100 sqm should encode to a meaningful u16 value, got {}", encoded, ); } }