seems fine

This commit is contained in:
Andras Schmelczer 2026-05-05 22:29:28 +01:00
parent 48983e3b4b
commit 7a1696541f
37 changed files with 4999 additions and 1242 deletions

View file

@ -5,11 +5,16 @@ use rayon::prelude::*;
use serde::Serialize;
use std::path::Path;
use rustc_hash::FxHashMap;
use rustc_hash::{FxHashMap, FxHashSet};
use crate::consts::{H3_PRECOMPUTE_MAX, HISTOGRAM_BINS, NAN_U16, QUANT_SCALE};
use crate::features::{self, Bounds};
/// Maximum number of rows taken from a token posting list as scoring candidates
/// for a free-text address query.
const ADDRESS_SEARCH_CANDIDATE_LIMIT: usize = 50_000;
/// Tokens whose posting list is longer than this are dropped from the address token index.
const ADDRESS_SEARCH_MAX_POSTINGS_PER_TOKEN: usize = 250_000;
/// Shortest token prefix (in bytes) stored in the address prefix index.
const ADDRESS_SEARCH_PREFIX_MIN_LEN: usize = 4;
/// Longest token prefix (in bytes) stored in the address prefix index; longer query
/// terms are truncated to this length before the prefix lookup.
const ADDRESS_SEARCH_PREFIX_MAX_LEN: usize = 8;
fn is_numeric_dtype(dtype: &DataType) -> bool {
matches!(
dtype,
@ -32,6 +37,360 @@ fn is_datetime_dtype(dtype: &DataType) -> bool {
matches!(dtype, DataType::Datetime(_, _) | DataType::Date)
}
/// One textual query term together with the interchangeable spellings that satisfy it.
#[derive(Clone, Debug)]
struct AddressTermGroup {
/// The typed token first, followed by any known aliases (for example "rd" and "road").
alternatives: Vec<String>,
}
/// Parsed form of a free-text address query, as produced by `parse_address_query`.
#[derive(Debug)]
struct AddressQuery {
/// Canonical upper-case "OUTWARD INWARD" postcode when the query contains a full one.
full_postcode: Option<String>,
/// Distinctive text terms; a row must match every group to be scored.
text_groups: Vec<AddressTermGroup>,
/// Sorted, de-duplicated number-like terms; when non-empty a row must match at least one.
numeric_terms: Vec<String>,
/// Non-stop-word terms used to pick the posting list that seeds candidate rows.
candidate_terms: Vec<String>,
}
/// Splits free-form address text into lowercase ASCII alphanumeric tokens.
///
/// Apostrophe-like characters (the ASCII apostrophe, the typographic apostrophe U+2019 and
/// the backtick) are dropped without ending the current token, so "St John's" yields
/// `["st", "johns"]`. Every other non-alphanumeric character, including non-ASCII letters,
/// acts as a separator. Empty tokens are never produced.
fn tokenize_address_text(text: &str) -> Vec<String> {
    let mut tokens = Vec::new();
    let mut current = String::new();

    for ch in text.chars() {
        if ch.is_ascii_alphanumeric() {
            current.push(ch.to_ascii_lowercase());
        } else if matches!(ch, '\'' | '\u{2019}' | '`') {
            // Apostrophes glue the surrounding letters together instead of splitting them.
            // The typographic variant is written as an escape so it survives re-encoding.
            continue;
        } else if !current.is_empty() {
            tokens.push(std::mem::take(&mut current));
        }
    }

    if !current.is_empty() {
        tokens.push(current);
    }
    tokens
}
/// Returns `true` when `compact` (a postcode with the space removed) has the shape of a
/// full postcode: a 2-4 character outward code that starts with a letter and contains a
/// digit, followed by a digit-letter-letter inward code.
fn is_full_postcode_compact(compact: &str) -> bool {
    let raw = compact.as_bytes();
    if raw.len() < 5 || raw.len() > 7 {
        return false;
    }

    // With 5..=7 bytes overall, the outward part is always 2..=4 bytes long.
    let (outward, inward) = raw.split_at(raw.len() - 3);

    let inward_ok = matches!(
        inward,
        [digit, first, second]
            if digit.is_ascii_digit()
                && first.is_ascii_alphabetic()
                && second.is_ascii_alphabetic()
    );
    if !inward_ok {
        return false;
    }

    let starts_with_letter = outward.first().is_some_and(u8::is_ascii_alphabetic);
    let only_alphanumeric = outward.iter().all(u8::is_ascii_alphanumeric);
    let has_digit = outward.iter().any(u8::is_ascii_digit);
    starts_with_letter && only_alphanumeric && has_digit
}
/// Formats a compact postcode as upper-case "OUTWARD INWARD", where the inward part is the
/// last three characters. The input must be at least three bytes long.
fn canonical_postcode_from_compact(compact: &str) -> String {
    let upper = compact.to_ascii_uppercase();
    let (outward, inward) = upper.split_at(upper.len() - 3);
    [outward, inward].join(" ")
}
/// Finds the first full postcode among `tokens`.
///
/// Single tokens are tried first (a postcode typed without a space); only if none matches
/// are adjacent token pairs joined and tried. Returns the canonical postcode together with
/// the indices of the token(s) that formed it.
fn extract_full_postcode(tokens: &[String]) -> Option<(String, Vec<usize>)> {
    let single = tokens.iter().enumerate().find_map(|(position, token)| {
        let candidate = token.to_ascii_uppercase();
        is_full_postcode_compact(&candidate)
            .then(|| (canonical_postcode_from_compact(&candidate), vec![position]))
    });

    single.or_else(|| {
        tokens.windows(2).enumerate().find_map(|(position, pair)| {
            let mut candidate = pair[0].to_ascii_uppercase();
            candidate.push_str(&pair[1].to_ascii_uppercase());
            is_full_postcode_compact(&candidate).then(|| {
                (
                    canonical_postcode_from_compact(&candidate),
                    vec![position, position + 1],
                )
            })
        })
    })
}
/// Returns `true` for short tokens shaped like part of a postcode: 2-4 ASCII alphanumeric
/// characters that start with a letter and include at least one digit (e.g. "e14", "sw1a").
fn looks_like_postcode_fragment(token: &str) -> bool {
    let bytes = token.as_bytes();
    if bytes.len() < 2 || bytes.len() > 4 {
        return false;
    }

    let mut has_digit = false;
    for byte in bytes {
        if !byte.is_ascii_alphanumeric() {
            return false;
        }
        has_digit |= byte.is_ascii_digit();
    }
    has_digit && bytes[0].is_ascii_alphabetic()
}
/// Returns `true` when `token` is a non-empty run of ASCII digits (e.g. a house number).
///
/// The empty string is explicitly rejected: `all` on an empty iterator is vacuously true,
/// which would otherwise classify "" as a number.
fn is_numeric_address_token(token: &str) -> bool {
    !token.is_empty() && token.bytes().all(|byte| byte.is_ascii_digit())
}
/// Returns the interchangeable spellings for a street-type or dwelling-type token, with the
/// token itself first. Tokens without known aliases yield an empty vector.
fn address_token_aliases(token: &str) -> Vec<&'static str> {
    let aliases: &[&'static str] = match token {
        "apt" => &["apt", "apartment"],
        "apartment" => &["apartment", "apt"],
        "ave" => &["ave", "avenue"],
        "avenue" => &["avenue", "ave"],
        "blvd" => &["blvd", "boulevard"],
        "boulevard" => &["boulevard", "blvd"],
        "cl" => &["cl", "close"],
        "close" => &["close", "cl"],
        "ct" => &["ct", "court"],
        "court" => &["court", "ct"],
        "cres" => &["cres", "crescent"],
        "crescent" => &["crescent", "cres"],
        "dr" => &["dr", "drive"],
        "drive" => &["drive", "dr"],
        "fl" => &["fl", "flat"],
        "flat" => &["flat", "fl"],
        "gdns" => &["gdns", "gardens", "garden"],
        "garden" => &["garden", "gardens", "gdns"],
        "gardens" => &["gardens", "garden", "gdns"],
        "hse" => &["hse", "house"],
        "house" => &["house", "hse"],
        "ln" => &["ln", "lane"],
        "lane" => &["lane", "ln"],
        "rd" => &["rd", "road"],
        "road" => &["road", "rd"],
        "sq" => &["sq", "square"],
        "square" => &["square", "sq"],
        // "st" is ambiguous, so it expands to both readings.
        "st" => &["st", "street", "saint"],
        "street" => &["street", "st"],
        "saint" => &["saint", "st"],
        "terr" => &["terr", "terrace"],
        "terrace" => &["terrace", "terr"],
        _ => &[],
    };
    aliases.to_vec()
}
/// Returns `true` for generic address words (articles, dwelling types, street types) that
/// are too common to identify an address on their own.
fn is_address_stop_token(token: &str) -> bool {
    const STOP_TOKENS: &[&str] = &[
        "a", "an", "and", "apartment", "apt", "avenue", "ave", "block", "building",
        "bungalow", "close", "cl", "court", "ct", "cres", "crescent", "drive", "dr",
        "estate", "flat", "fl", "floor", "garden", "gardens", "gdns", "grove", "house",
        "hse", "lane", "ln", "lodge", "mansions", "mews", "of", "park", "place", "road",
        "rd", "room", "row", "saint", "sq", "square", "st", "street", "terr", "terrace",
        "the", "unit", "view", "villas", "walk", "way", "yard",
    ];
    STOP_TOKENS.iter().any(|stop| *stop == token)
}
/// Builds the term group for one query token, or `None` when the token should not be used
/// for text matching: shorter than three bytes, purely numeric, shaped like a postcode
/// fragment, or consisting only of stop words once aliases are included.
fn address_term_group(token: &str) -> Option<AddressTermGroup> {
    let too_short = token.len() < 3;
    if too_short || is_numeric_address_token(token) || looks_like_postcode_fragment(token) {
        return None;
    }

    // The typed token always comes first; aliases follow without repeating it.
    let mut alternatives = vec![token.to_string()];
    for alias in address_token_aliases(token) {
        if alternatives.iter().all(|known| known != alias) {
            alternatives.push(alias.to_string());
        }
    }

    let distinctive = alternatives
        .iter()
        .any(|alternative| !is_address_stop_token(alternative));
    distinctive.then_some(AddressTermGroup { alternatives })
}
/// Returns the sorted, de-duplicated tokens of an address that are kept for per-row
/// matching.
fn address_search_tokens(text: &str) -> Vec<String> {
    let mut kept = tokenize_address_text(text);
    kept.retain(|candidate| is_address_search_token(candidate));
    kept.sort_unstable();
    kept.dedup();
    kept
}
/// Decides whether an address token is stored for per-row matching.
///
/// Postcode fragments are never kept. Purely numeric tokens are always kept. Tokens that
/// contain a digit need at least two bytes, and plain words need at least three.
fn is_address_search_token(token: &str) -> bool {
    if looks_like_postcode_fragment(token) {
        return false;
    }

    let has_digit = token.chars().any(|ch| ch.is_ascii_digit());
    match (is_numeric_address_token(token), has_digit) {
        (true, _) => true,
        (false, true) => token.len() >= 2,
        (false, false) => token.len() >= 3,
    }
}
/// Decides whether an address token gets a posting list in the inverted index.
///
/// Purely numeric tokens and postcode fragments are excluded. Of the rest, any token that
/// contains a digit qualifies, as does any word of three or more bytes that is not a stop
/// word.
fn is_address_candidate_token(token: &str) -> bool {
    if is_numeric_address_token(token) || looks_like_postcode_fragment(token) {
        return false;
    }
    if token.chars().any(|ch| ch.is_ascii_digit()) {
        return true;
    }
    token.len() >= 3 && !is_address_stop_token(token)
}
fn address_prefix_key(term: &str) -> &str {
if term.len() > ADDRESS_SEARCH_PREFIX_MAX_LEN {
&term[..ADDRESS_SEARCH_PREFIX_MAX_LEN]
} else {
term
}
}
/// Builds the prefix lookup for the address token index.
///
/// Every indexed token is registered under each of its prefixes whose length lies between
/// `ADDRESS_SEARCH_PREFIX_MIN_LEN` and `ADDRESS_SEARCH_PREFIX_MAX_LEN` bytes (capped at the
/// token's own length). Each prefix maps to a sorted, de-duplicated list of full tokens.
fn build_address_prefix_index(
    address_token_index: &FxHashMap<String, Vec<u32>>,
) -> FxHashMap<String, Vec<String>> {
    let mut by_prefix: FxHashMap<String, Vec<String>> = FxHashMap::default();

    for full_token in address_token_index.keys() {
        // Tokens shorter than the minimum prefix length produce an empty range here.
        let longest = full_token.len().min(ADDRESS_SEARCH_PREFIX_MAX_LEN);
        (ADDRESS_SEARCH_PREFIX_MIN_LEN..=longest).for_each(|end| {
            by_prefix
                .entry(full_token[..end].to_owned())
                .or_default()
                .push(full_token.to_owned());
        });
    }

    // Map iteration order is unspecified, so sort each bucket to keep lookups deterministic.
    by_prefix.values_mut().for_each(|bucket| {
        bucket.sort_unstable();
        bucket.dedup();
    });
    by_prefix
}
fn parse_address_query(query: &str) -> AddressQuery {
let tokens = tokenize_address_text(query);
let (full_postcode, postcode_token_indices) = extract_full_postcode(&tokens)
.map(|(postcode, indices)| (Some(postcode), indices))
.unwrap_or((None, Vec::new()));
let skip_postcode_tokens: FxHashSet<usize> = postcode_token_indices.into_iter().collect();
let mut text_groups = Vec::new();
let mut numeric_terms = Vec::new();
let mut candidate_terms = Vec::new();
for (idx, token) in tokens.iter().enumerate() {
if skip_postcode_tokens.contains(&idx) || looks_like_postcode_fragment(token) {
continue;
}
if is_numeric_address_token(token) {
numeric_terms.push(token.clone());
continue;
}
if let Some(group) = address_term_group(token) {
for alternative in &group.alternatives {
if !is_address_stop_token(alternative)
&& !candidate_terms.iter().any(|term| term == alternative)
{
candidate_terms.push(alternative.clone());
}
}
text_groups.push(group);
} else if token.chars().any(|ch| ch.is_ascii_digit()) && token.len() >= 2 {
numeric_terms.push(token.clone());
if !candidate_terms.iter().any(|term| term == token) {
candidate_terms.push(token.clone());
}
}
}
text_groups.dedup_by(|left, right| left.alternatives == right.alternatives);
numeric_terms.sort_unstable();
numeric_terms.dedup();
AddressQuery {
full_postcode,
text_groups,
numeric_terms,
candidate_terms,
}
}
/// Returns `true` when an address token satisfies a text query term: query terms of three
/// or more bytes match as a prefix of the token, shorter ones must match exactly.
fn token_matches_query_term(token: &str, query_term: &str) -> bool {
    if query_term.len() >= 3 {
        // An exact match is just the longest possible prefix match.
        token.starts_with(query_term)
    } else {
        token == query_term
    }
}
/// Returns `true` when an address token begins with the numeric query term, so "1" matches
/// "1", "10" and "12a". Equality is covered by the prefix check.
fn token_matches_numeric_term(token: &str, query_term: &str) -> bool {
    token.starts_with(query_term)
}
/// Returns `true` when at least one address token satisfies at least one alternative of
/// the term group.
fn address_tokens_match_group(tokens: &[String], group: &AddressTermGroup) -> bool {
    for token in tokens {
        for alternative in &group.alternatives {
            if token_matches_query_term(token, alternative) {
                return true;
            }
        }
    }
    false
}
/// Histogram with outlier buckets at the edges.
/// - Bin 0: [min, p1) — low outliers
/// - Bins 1 to n-2: [p1, p99) — main distribution, evenly divided
@ -163,6 +522,20 @@ pub struct PropertyData {
/// Interned postcodes: reader is thread-safe, keys index into it.
postcode_interner: lasso::RodeoReader,
postcode_keys: Vec<lasso::Spur>,
/// Rows for each postcode, keyed by the interned postcode key.
postcode_row_index: FxHashMap<lasso::Spur, Vec<u32>>,
/// Inverted index from address tokens to property rows.
address_token_index: FxHashMap<String, Vec<u32>>,
/// Prefix lookup from typed address-token prefix to indexed full address tokens.
address_prefix_index: FxHashMap<String, Vec<String>>,
/// Interned normalized address-search tokens used for per-row scoring.
address_search_interner: lasso::RodeoReader,
/// Flat per-row normalized address-search token keys.
address_search_token_keys: Vec<lasso::Spur>,
/// Offset into `address_search_token_keys` for each row.
address_search_token_offsets: Vec<u32>,
/// Number of normalized address-search token keys for each row.
address_search_token_lengths: Vec<u16>,
/// For enum features: maps feature index to list of possible string values.
/// Index in values list corresponds to the u16 value stored in feature_data.
pub enum_values: rustc_hash::FxHashMap<usize, Vec<String>>,
@ -197,6 +570,164 @@ impl PropertyData {
(&self.postcode_interner, &self.postcode_keys)
}
/// Returns the interned address-search token keys stored for `row`, as a slice of the flat
/// key buffer. Panics if `row` is out of range.
fn row_address_search_tokens(&self, row: usize) -> &[lasso::Spur] {
    let start = self.address_search_token_offsets[row] as usize;
    let end = start + self.address_search_token_lengths[row] as usize;
    &self.address_search_token_keys[start..end]
}
/// Search individual property addresses. Full postcode queries use a direct row index;
/// free-text queries use a small inverted index over distinctive address tokens.
///
/// Returns up to `limit` row indices, best match first. Ties on score are broken by the
/// shorter address and then by the lower row index. Rows with a blank address are skipped,
/// and rows sharing the same (case-insensitive) address and postcode are reported once.
///
/// An empty result is returned when `limit` is zero, when the query has no usable terms,
/// or when no candidate rows are found. A free-text query made up only of plain numbers
/// has no candidate terms and therefore finds nothing.
pub fn search_addresses(&self, query: &str, limit: usize) -> Vec<usize> {
    if limit == 0 {
        return Vec::new();
    }

    let parsed = parse_address_query(query);
    if parsed.full_postcode.is_none()
        && parsed.text_groups.is_empty()
        && parsed.numeric_terms.is_empty()
    {
        return Vec::new();
    }

    let candidate_rows: Vec<u32> = if let Some(postcode) = parsed.full_postcode.as_deref() {
        // A full postcode pins the candidate set exactly; there is no text fallback.
        self.postcode_interner
            .get(postcode)
            .and_then(|key| self.postcode_row_index.get(&key))
            .cloned()
            .unwrap_or_default()
    } else if let Some(rows) = self.best_address_token_rows(&parsed.candidate_terms) {
        rows.iter()
            .take(ADDRESS_SEARCH_CANDIDATE_LIMIT)
            .copied()
            .collect()
    } else {
        Vec::new()
    };

    if candidate_rows.is_empty() {
        return Vec::new();
    }

    // (score, address length, row) so the ordering below needs no further lookups.
    let mut scored: Vec<(i32, usize, usize)> = candidate_rows
        .into_iter()
        .filter_map(|row| {
            let row = row as usize;
            self.address_match_score(row, &parsed)
                .map(|score| (score, self.address(row).len(), row))
        })
        .collect();

    scored.sort_unstable_by(|left, right| {
        right
            .0
            .cmp(&left.0)
            .then(left.1.cmp(&right.1))
            .then(left.2.cmp(&right.2))
    });

    let mut seen = FxHashSet::default();
    // Cap the reservation by what can actually be returned: `limit` is caller-supplied and
    // reserving it verbatim could request an enormous (or overflowing) allocation.
    let mut results = Vec::with_capacity(limit.min(scored.len()));
    for (_, _, row) in scored {
        let address = self.address(row).trim();
        if address.is_empty() {
            continue;
        }

        let key = format!("{}\n{}", address.to_ascii_lowercase(), self.postcode(row));
        if !seen.insert(key) {
            continue;
        }

        results.push(row);
        if results.len() == limit {
            break;
        }
    }

    results
}
fn best_address_token_rows(&self, terms: &[String]) -> Option<&[u32]> {
let mut best: Option<&[u32]> = None;
for term in terms {
if let Some(rows) = self.address_token_index.get(term) {
if best.map_or(true, |current| rows.len() < current.len()) {
best = Some(rows.as_slice());
}
continue;
}
if term.len() < 4 {
continue;
}
if let Some(tokens) = self.address_prefix_index.get(address_prefix_key(term)) {
for token in tokens {
if !token.starts_with(term) {
continue;
}
if let Some(rows) = self.address_token_index.get(token) {
if best.map_or(true, |current| rows.len() < current.len()) {
best = Some(rows.as_slice());
}
}
}
}
}
best
}
/// Scores how well `row` matches the parsed query, or returns `None` when it does not
/// qualify.
///
/// A row is rejected when its address is blank, when any text group is unmatched, or when
/// the query has numeric terms and none of them matches. Otherwise the score is 1000 if
/// the query carried a full postcode, plus 200 per text group, plus 90 per matched numeric
/// term, plus 50 when every numeric term matched.
fn address_match_score(&self, row: usize, parsed: &AddressQuery) -> Option<i32> {
    if self.address(row).trim().is_empty() {
        return None;
    }

    let tokens = self.row_address_search_tokens(row);

    let every_group_matches = parsed
        .text_groups
        .iter()
        .all(|group| self.address_tokens_match_group(tokens, group));
    if !every_group_matches {
        return None;
    }

    let mut numeric_matches = 0usize;
    for term in &parsed.numeric_terms {
        let hit = tokens.iter().any(|token| {
            token_matches_numeric_term(self.address_search_interner.resolve(token), term)
        });
        if hit {
            numeric_matches += 1;
        }
    }

    let wants_numbers = !parsed.numeric_terms.is_empty();
    if wants_numbers && numeric_matches == 0 {
        return None;
    }

    let postcode_bonus = if parsed.full_postcode.is_some() {
        1_000
    } else {
        0
    };
    let text_score = (parsed.text_groups.len() as i32) * 200;
    let numeric_score = (numeric_matches as i32) * 90;
    let all_numbers_bonus =
        if numeric_matches > 0 && numeric_matches == parsed.numeric_terms.len() {
            50
        } else {
            0
        };

    Some(postcode_bonus + text_score + numeric_score + all_numbers_bonus)
}
/// Returns `true` when at least one of the row's interned tokens satisfies at least one
/// alternative of the term group.
fn address_tokens_match_group(&self, tokens: &[lasso::Spur], group: &AddressTermGroup) -> bool {
    for key in tokens {
        let token = self.address_search_interner.resolve(key);
        for alternative in &group.alternatives {
            if token_matches_query_term(token, alternative) {
                return true;
            }
        }
    }
    false
}
/// Get the is_approx_build_date flag for a given row (bit-packed).
pub fn is_approx_build_date(&self, row: usize) -> bool {
let byte = self.approx_build_date_bits[row / 8];
@ -946,27 +1477,70 @@ impl PropertyData {
.map(|&perm_index| lon[perm_index as usize])
.collect();
// Build contiguous address buffer (permuted)
// Build contiguous address buffer and address search index (permuted)
tracing::info!("Building interned strings");
let total_addr_bytes: usize = address_raw.iter().map(|text| text.len()).sum();
let mut address_buffer = String::with_capacity(total_addr_bytes);
let mut address_offsets = Vec::with_capacity(row_count);
let mut address_lengths = Vec::with_capacity(row_count);
for &perm_index in &perm {
let mut address_token_index: FxHashMap<String, Vec<u32>> = FxHashMap::default();
let mut address_search_rodeo = lasso::Rodeo::default();
let mut address_search_token_keys: Vec<lasso::Spur> = Vec::new();
let mut address_search_token_offsets = Vec::with_capacity(row_count);
let mut address_search_token_lengths = Vec::with_capacity(row_count);
for (new_row, &perm_index) in perm.iter().enumerate() {
let addr = &address_raw[perm_index as usize];
let offset = address_buffer.len() as u32;
let length = addr.len().min(u16::MAX as usize) as u16;
address_offsets.push(offset);
address_lengths.push(length);
address_buffer.push_str(&addr[..length as usize]);
let search_tokens = address_search_tokens(addr);
let token_offset = address_search_token_keys.len() as u32;
let token_length = search_tokens.len().min(u16::MAX as usize) as u16;
address_search_token_offsets.push(token_offset);
address_search_token_lengths.push(token_length);
for token in search_tokens.iter().take(token_length as usize) {
let key = address_search_rodeo.get_or_intern(token);
address_search_token_keys.push(key);
if is_address_candidate_token(token) {
address_token_index
.entry(token.clone())
.or_default()
.push(new_row as u32);
}
}
}
let address_token_count_before_prune = address_token_index.len();
address_token_index.retain(|_, rows| rows.len() <= ADDRESS_SEARCH_MAX_POSTINGS_PER_TOKEN);
let address_prefix_index = build_address_prefix_index(&address_token_index);
let address_search_interner = address_search_rodeo.into_reader();
let address_postings_count: usize = address_token_index.values().map(Vec::len).sum();
tracing::info!(
tokens = address_token_index.len(),
prefixes = address_prefix_index.len(),
pruned_tokens =
address_token_count_before_prune.saturating_sub(address_token_index.len()),
postings = address_postings_count,
row_tokens = address_search_token_keys.len(),
"Address search index built"
);
// Intern postcodes (permuted)
let mut postcode_rodeo = lasso::Rodeo::default();
let postcode_keys: Vec<lasso::Spur> = perm
.iter()
.map(|&perm_index| postcode_rodeo.get_or_intern(&postcode_raw[perm_index as usize]))
.collect();
let mut postcode_keys: Vec<lasso::Spur> = Vec::with_capacity(row_count);
let mut postcode_row_index: FxHashMap<lasso::Spur, Vec<u32>> = FxHashMap::default();
for (new_row, &perm_index) in perm.iter().enumerate() {
let key = postcode_rodeo.get_or_intern(&postcode_raw[perm_index as usize]);
postcode_keys.push(key);
postcode_row_index
.entry(key)
.or_default()
.push(new_row as u32);
}
let postcode_interner = postcode_rodeo.into_reader();
// Pack is_approx_build_date into a bitvec (8 bools per byte)
@ -1110,6 +1684,13 @@ impl PropertyData {
address_lengths,
postcode_interner,
postcode_keys,
postcode_row_index,
address_token_index,
address_prefix_index,
address_search_interner,
address_search_token_keys,
address_search_token_offsets,
address_search_token_lengths,
enum_values,
enum_counts,
approx_build_date_bits,
@ -1133,6 +1714,120 @@ mod tests {
Bounds::Percentile { low, high }
}
// Full postcodes of every outward length are accepted; fragments and plain words are not.
#[test]
fn full_postcode_detection_accepts_common_formats() {
    for valid in ["SW1A1AA", "E142DG", "M11AE"] {
        assert!(is_full_postcode_compact(valid));
    }
    for invalid in ["E14", "DOWNING", "10A"] {
        assert!(!is_full_postcode_compact(invalid));
    }
}
// A spaced postcode and generic words ("flat", "st") must not leak into the search terms.
#[test]
fn address_query_parsing_skips_postcodes_and_street_suffixes() {
    let parsed = parse_address_query("Flat 2, 10 Downing St, SW1A 2AA");

    assert_eq!(parsed.full_postcode, Some("SW1A 2AA".to_string()));
    assert_eq!(parsed.numeric_terms, ["10", "2"]);
    assert_eq!(parsed.candidate_terms, ["downing"]);

    assert_eq!(parsed.text_groups.len(), 1);
    let only_group = &parsed.text_groups[0];
    assert_eq!(only_group.alternatives, ["downing"]);
}
// A postcode typed without its space is still recognised and canonicalised.
#[test]
fn address_query_parsing_handles_compact_postcodes() {
    let parsed = parse_address_query("10 downing street sw1a1aa");

    assert_eq!(parsed.full_postcode, Some("SW1A 1AA".to_string()));
    assert_eq!(parsed.numeric_terms, ["10"]);
    assert_eq!(parsed.candidate_terms, ["downing"]);
}
// A half-typed word ("cour") is kept as its own term so rows can be prefix-matched.
#[test]
fn address_query_parsing_keeps_partial_terms_for_row_matching() {
    let parsed = parse_address_query("settlers cour");

    assert!(parsed.full_postcode.is_none());
    assert!(parsed.numeric_terms.is_empty());
    assert_eq!(parsed.candidate_terms, ["settlers", "cour"]);

    assert_eq!(parsed.text_groups.len(), 2);
    let (first, second) = (&parsed.text_groups[0], &parsed.text_groups[1]);
    assert_eq!(first.alternatives, ["settlers"]);
    assert_eq!(second.alternatives, ["cour"]);
}
// Row tokens keep numbers and stop words (needed for scoring) and come back sorted.
#[test]
fn address_search_tokens_keep_actual_address_terms_for_scoring() {
    let tokens = address_search_tokens("Flat 2, 10 Downing Cour");
    let expected = ["10", "2", "cour", "downing", "flat"];
    assert_eq!(tokens, expected);
}
// Prefixes map to every indexed token that starts with them; too-short prefixes are absent.
#[test]
fn address_prefix_index_finds_partial_address_terms() {
    let mut token_index: FxHashMap<String, Vec<u32>> = FxHashMap::default();
    for (token, row) in [("downing", 1u32), ("downton", 2), ("market", 3)] {
        token_index.insert(token.to_string(), vec![row]);
    }

    let prefix_index = build_address_prefix_index(&token_index);
    let lookup = |prefix: &str| prefix_index.get(prefix).cloned().unwrap_or_default();

    assert_eq!(lookup("down"), ["downing", "downton"]);
    assert_eq!(lookup("downi"), ["downing"]);
    assert_eq!(lookup("downt"), ["downton"]);
    assert!(!prefix_index.contains_key("do"));
}
// A typed prefix and an abbreviation alias must both match the full street name.
#[test]
fn address_term_matching_allows_prefixes_and_aliases() {
    let tokens = tokenize_address_text("10 Downing Street");

    let prefix_group = address_term_group("down").expect("prefix term should be searchable");
    assert!(address_tokens_match_group(&tokens, &prefix_group));

    let alias_group = AddressTermGroup {
        alternatives: ["st", "street"].iter().map(|alias| alias.to_string()).collect(),
    };
    assert!(address_tokens_match_group(&tokens, &alias_group));
}
// A three-letter partial term matches the address token it is a prefix of.
#[test]
fn address_term_matching_uses_actual_token_prefixes() {
    let tokens = tokenize_address_text("12 Settlers Court");
    let partial = address_term_group("cou").expect("partial term should be searchable");
    assert!(address_tokens_match_group(&tokens, &partial));
}
#[test]
fn histogram_empty_data() {
let data: Vec<f32> = vec![];