seems fine
This commit is contained in:
parent
48983e3b4b
commit
7a1696541f
37 changed files with 4999 additions and 1242 deletions
|
|
@ -5,11 +5,16 @@ use rayon::prelude::*;
|
|||
use serde::Serialize;
|
||||
use std::path::Path;
|
||||
|
||||
use rustc_hash::FxHashMap;
|
||||
use rustc_hash::{FxHashMap, FxHashSet};
|
||||
|
||||
use crate::consts::{H3_PRECOMPUTE_MAX, HISTOGRAM_BINS, NAN_U16, QUANT_SCALE};
|
||||
use crate::features::{self, Bounds};
|
||||
|
||||
const ADDRESS_SEARCH_CANDIDATE_LIMIT: usize = 50_000;
|
||||
const ADDRESS_SEARCH_MAX_POSTINGS_PER_TOKEN: usize = 250_000;
|
||||
const ADDRESS_SEARCH_PREFIX_MIN_LEN: usize = 4;
|
||||
const ADDRESS_SEARCH_PREFIX_MAX_LEN: usize = 8;
|
||||
|
||||
fn is_numeric_dtype(dtype: &DataType) -> bool {
|
||||
matches!(
|
||||
dtype,
|
||||
|
|
@ -32,6 +37,360 @@ fn is_datetime_dtype(dtype: &DataType) -> bool {
|
|||
matches!(dtype, DataType::Datetime(_, _) | DataType::Date)
|
||||
}
|
||||
|
||||
/// One searchable query term together with the spellings accepted in its place.
///
/// A row satisfies the group as soon as any single alternative matches one of
/// the row's address tokens.
#[derive(Debug, Clone)]
struct AddressTermGroup {
    /// The token as typed, followed by any known aliases for it.
    alternatives: Vec<String>,
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct AddressQuery {
|
||||
full_postcode: Option<String>,
|
||||
text_groups: Vec<AddressTermGroup>,
|
||||
numeric_terms: Vec<String>,
|
||||
candidate_terms: Vec<String>,
|
||||
}
|
||||
|
||||
/// Splits free text into lower-case ASCII alphanumeric tokens.
///
/// Apostrophe-like characters (`'`, `’` and the backtick) are dropped without
/// ending the current token, so "John's" becomes `johns`. Any other character
/// that is not ASCII alphanumeric (including non-ASCII letters) ends the
/// current token. Empty tokens are never produced.
fn tokenize_address_text(text: &str) -> Vec<String> {
    let mut words: Vec<String> = Vec::new();
    let mut pending = String::new();

    for symbol in text.chars() {
        match symbol {
            c if c.is_ascii_alphanumeric() => pending.push(c.to_ascii_lowercase()),
            // Apostrophes vanish so possessives stay glued to their word.
            '\'' | '’' | '`' => {}
            _ => {
                if !pending.is_empty() {
                    words.push(std::mem::take(&mut pending));
                }
            }
        }
    }

    // Flush the trailing token, if the text did not end on a separator.
    if !pending.is_empty() {
        words.push(pending);
    }

    words
}
|
||||
|
||||
/// Returns `true` when `compact` (a postcode written without a space) has the
/// shape of a full postcode.
///
/// Accepted shape: 5 to 7 bytes in total; the last three are digit, letter,
/// letter; the remaining 2 to 4 leading bytes start with a letter, are all
/// ASCII alphanumeric and contain at least one digit. Letter case is ignored.
fn is_full_postcode_compact(compact: &str) -> bool {
    let raw = compact.as_bytes();
    if raw.len() < 5 || raw.len() > 7 {
        return false;
    }

    // With 5..=7 bytes in total the outward part is always 2..=4 bytes long.
    let (outward, inward) = raw.split_at(raw.len() - 3);

    let inward_ok =
        inward[0].is_ascii_digit() && inward[1..].iter().all(|byte| byte.is_ascii_alphabetic());
    if !inward_ok {
        return false;
    }

    let mut saw_digit = false;
    for byte in outward {
        if !byte.is_ascii_alphanumeric() {
            return false;
        }
        saw_digit |= byte.is_ascii_digit();
    }

    saw_digit && outward[0].is_ascii_alphabetic()
}
|
||||
|
||||
/// Formats a compact postcode as upper-case "OUTWARD INWARD", inserting a
/// single space before the last three characters.
///
/// Inputs that cannot be split that way (three bytes or fewer, or a split
/// point that is not a character boundary) are returned upper-cased but
/// otherwise unchanged instead of panicking.
fn canonical_postcode_from_compact(compact: &str) -> String {
    let upper = compact.to_ascii_uppercase();

    match upper.len().checked_sub(3) {
        Some(split) if split > 0 && upper.is_char_boundary(split) => {
            let (outward, inward) = upper.split_at(split);
            format!("{} {}", outward, inward)
        }
        // Nothing sensible to split off: hand back the normalised text as-is.
        _ => upper,
    }
}
|
||||
|
||||
fn extract_full_postcode(tokens: &[String]) -> Option<(String, Vec<usize>)> {
|
||||
for (idx, token) in tokens.iter().enumerate() {
|
||||
let compact = token.to_ascii_uppercase();
|
||||
if is_full_postcode_compact(&compact) {
|
||||
return Some((canonical_postcode_from_compact(&compact), vec![idx]));
|
||||
}
|
||||
}
|
||||
|
||||
for idx in 0..tokens.len().saturating_sub(1) {
|
||||
let compact = format!(
|
||||
"{}{}",
|
||||
tokens[idx].to_ascii_uppercase(),
|
||||
tokens[idx + 1].to_ascii_uppercase()
|
||||
);
|
||||
if is_full_postcode_compact(&compact) {
|
||||
return Some((
|
||||
canonical_postcode_from_compact(&compact),
|
||||
vec![idx, idx + 1],
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
/// Returns `true` for short tokens shaped like part of a postcode: 2 to 4
/// bytes, all ASCII alphanumeric, starting with a letter and containing at
/// least one digit (for example `e14` or `sw1a`).
fn looks_like_postcode_fragment(token: &str) -> bool {
    if token.len() < 2 || token.len() > 4 {
        return false;
    }

    let mut saw_digit = false;
    for (position, symbol) in token.chars().enumerate() {
        if !symbol.is_ascii_alphanumeric() {
            return false;
        }
        if position == 0 && !symbol.is_ascii_alphabetic() {
            return false;
        }
        saw_digit |= symbol.is_ascii_digit();
    }

    saw_digit
}
|
||||
|
||||
/// Returns `true` when `token` is a non-empty run of ASCII digits.
///
/// The empty string is rejected explicitly: an `all` check alone is vacuously
/// true for it, which would classify "nothing" as a number.
fn is_numeric_address_token(token: &str) -> bool {
    !token.is_empty() && token.bytes().all(|byte| byte.is_ascii_digit())
}
|
||||
|
||||
/// Returns the interchangeable spellings for a common address word.
///
/// The queried token always comes first in the result, followed by its
/// alias(es). Tokens without a known alias yield an empty vector.
fn address_token_aliases(token: &str) -> Vec<&'static str> {
    // Abbreviation / full-word pairs. "st" is ambiguous (street or saint) and
    // is resolved by the explicit arm below before this table is consulted.
    const PAIRS: [(&str, &str); 15] = [
        ("apt", "apartment"),
        ("ave", "avenue"),
        ("blvd", "boulevard"),
        ("cl", "close"),
        ("ct", "court"),
        ("cres", "crescent"),
        ("dr", "drive"),
        ("fl", "flat"),
        ("hse", "house"),
        ("ln", "lane"),
        ("rd", "road"),
        ("sq", "square"),
        ("st", "street"),
        ("st", "saint"),
        ("terr", "terrace"),
    ];

    // Three-way groups have a fixed order per entry point, so spell them out.
    match token {
        "st" => return vec!["st", "street", "saint"],
        "gdns" => return vec!["gdns", "gardens", "garden"],
        "garden" => return vec!["garden", "gardens", "gdns"],
        "gardens" => return vec!["gardens", "garden", "gdns"],
        _ => {}
    }

    for &(short, long) in PAIRS.iter() {
        if token == short {
            return vec![short, long];
        }
        if token == long {
            return vec![long, short];
        }
    }

    Vec::new()
}
|
||||
|
||||
/// Returns `true` for words listed below as too common in addresses to be
/// distinctive search terms (articles, dwelling types, street suffixes and
/// their abbreviations).
fn is_address_stop_token(token: &str) -> bool {
    const STOP_TOKENS: &[&str] = &[
        "a", "an", "and", "apartment", "apt", "avenue", "ave", "block", "building", "bungalow",
        "close", "cl", "court", "ct", "cres", "crescent", "drive", "dr", "estate", "flat", "fl",
        "floor", "garden", "gardens", "gdns", "grove", "house", "hse", "lane", "ln", "lodge",
        "mansions", "mews", "of", "park", "place", "road", "rd", "room", "row", "saint", "sq",
        "square", "st", "street", "terr", "terrace", "the", "unit", "view", "villas", "walk",
        "way", "yard",
    ];

    STOP_TOKENS.contains(&token)
}
|
||||
|
||||
fn address_term_group(token: &str) -> Option<AddressTermGroup> {
|
||||
if token.len() < 3 || is_numeric_address_token(token) || looks_like_postcode_fragment(token) {
|
||||
return None;
|
||||
}
|
||||
|
||||
let mut alternatives = Vec::new();
|
||||
alternatives.push(token.to_string());
|
||||
for alias in address_token_aliases(token) {
|
||||
if !alternatives.iter().any(|existing| existing == alias) {
|
||||
alternatives.push(alias.to_string());
|
||||
}
|
||||
}
|
||||
|
||||
if alternatives
|
||||
.iter()
|
||||
.all(|alternative| is_address_stop_token(alternative))
|
||||
{
|
||||
return None;
|
||||
}
|
||||
|
||||
Some(AddressTermGroup { alternatives })
|
||||
}
|
||||
|
||||
fn address_search_tokens(text: &str) -> Vec<String> {
|
||||
let mut tokens: Vec<String> = tokenize_address_text(text)
|
||||
.into_iter()
|
||||
.filter(|token| is_address_search_token(token))
|
||||
.collect();
|
||||
tokens.sort_unstable();
|
||||
tokens.dedup();
|
||||
tokens
|
||||
}
|
||||
|
||||
fn is_address_search_token(token: &str) -> bool {
|
||||
if looks_like_postcode_fragment(token) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if is_numeric_address_token(token) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if token.chars().any(|ch| ch.is_ascii_digit()) {
|
||||
return token.len() >= 2;
|
||||
}
|
||||
|
||||
token.len() >= 3
|
||||
}
|
||||
|
||||
fn is_address_candidate_token(token: &str) -> bool {
|
||||
!is_numeric_address_token(token)
|
||||
&& !looks_like_postcode_fragment(token)
|
||||
&& (token.chars().any(|ch| ch.is_ascii_digit())
|
||||
|| (token.len() >= 3 && !is_address_stop_token(token)))
|
||||
}
|
||||
|
||||
fn address_prefix_key(term: &str) -> &str {
|
||||
if term.len() > ADDRESS_SEARCH_PREFIX_MAX_LEN {
|
||||
&term[..ADDRESS_SEARCH_PREFIX_MAX_LEN]
|
||||
} else {
|
||||
term
|
||||
}
|
||||
}
|
||||
|
||||
fn build_address_prefix_index(
|
||||
address_token_index: &FxHashMap<String, Vec<u32>>,
|
||||
) -> FxHashMap<String, Vec<String>> {
|
||||
let mut prefix_index: FxHashMap<String, Vec<String>> = FxHashMap::default();
|
||||
|
||||
for token in address_token_index.keys() {
|
||||
let max_prefix_len = token.len().min(ADDRESS_SEARCH_PREFIX_MAX_LEN);
|
||||
for prefix_len in ADDRESS_SEARCH_PREFIX_MIN_LEN..=max_prefix_len {
|
||||
prefix_index
|
||||
.entry(token[..prefix_len].to_string())
|
||||
.or_default()
|
||||
.push(token.clone());
|
||||
}
|
||||
}
|
||||
|
||||
for tokens in prefix_index.values_mut() {
|
||||
tokens.sort_unstable();
|
||||
tokens.dedup();
|
||||
}
|
||||
|
||||
prefix_index
|
||||
}
|
||||
|
||||
fn parse_address_query(query: &str) -> AddressQuery {
|
||||
let tokens = tokenize_address_text(query);
|
||||
let (full_postcode, postcode_token_indices) = extract_full_postcode(&tokens)
|
||||
.map(|(postcode, indices)| (Some(postcode), indices))
|
||||
.unwrap_or((None, Vec::new()));
|
||||
|
||||
let skip_postcode_tokens: FxHashSet<usize> = postcode_token_indices.into_iter().collect();
|
||||
let mut text_groups = Vec::new();
|
||||
let mut numeric_terms = Vec::new();
|
||||
let mut candidate_terms = Vec::new();
|
||||
|
||||
for (idx, token) in tokens.iter().enumerate() {
|
||||
if skip_postcode_tokens.contains(&idx) || looks_like_postcode_fragment(token) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if is_numeric_address_token(token) {
|
||||
numeric_terms.push(token.clone());
|
||||
continue;
|
||||
}
|
||||
|
||||
if let Some(group) = address_term_group(token) {
|
||||
for alternative in &group.alternatives {
|
||||
if !is_address_stop_token(alternative)
|
||||
&& !candidate_terms.iter().any(|term| term == alternative)
|
||||
{
|
||||
candidate_terms.push(alternative.clone());
|
||||
}
|
||||
}
|
||||
text_groups.push(group);
|
||||
} else if token.chars().any(|ch| ch.is_ascii_digit()) && token.len() >= 2 {
|
||||
numeric_terms.push(token.clone());
|
||||
if !candidate_terms.iter().any(|term| term == token) {
|
||||
candidate_terms.push(token.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
text_groups.dedup_by(|left, right| left.alternatives == right.alternatives);
|
||||
numeric_terms.sort_unstable();
|
||||
numeric_terms.dedup();
|
||||
|
||||
AddressQuery {
|
||||
full_postcode,
|
||||
text_groups,
|
||||
numeric_terms,
|
||||
candidate_terms,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns `true` when an address token satisfies a text query term: either an
/// exact match, or the token starts with the term and the term is at least
/// three bytes long.
fn token_matches_query_term(token: &str, query_term: &str) -> bool {
    if token == query_term {
        return true;
    }

    let long_enough_for_prefix = query_term.len() >= 3;
    long_enough_for_prefix && token.starts_with(query_term)
}
|
||||
|
||||
/// Returns `true` when an address token equals or starts with a numeric query
/// term. There is no minimum term length, so `1` matches `10` and `1a`.
fn token_matches_numeric_term(token: &str, query_term: &str) -> bool {
    // A string always starts with itself, so this also covers exact equality.
    token.starts_with(query_term)
}
|
||||
|
||||
fn address_tokens_match_group(tokens: &[String], group: &AddressTermGroup) -> bool {
|
||||
group.alternatives.iter().any(|alternative| {
|
||||
tokens
|
||||
.iter()
|
||||
.any(|token| token_matches_query_term(token, alternative))
|
||||
})
|
||||
}
|
||||
|
||||
/// Histogram with outlier buckets at the edges.
|
||||
/// - Bin 0: [min, p1) — low outliers
|
||||
/// - Bins 1 to n-2: [p1, p99) — main distribution, evenly divided
|
||||
|
|
@ -163,6 +522,20 @@ pub struct PropertyData {
|
|||
/// Interned postcodes: reader is thread-safe, keys index into it.
|
||||
postcode_interner: lasso::RodeoReader,
|
||||
postcode_keys: Vec<lasso::Spur>,
|
||||
/// Rows for each postcode, keyed by the interned postcode key.
|
||||
postcode_row_index: FxHashMap<lasso::Spur, Vec<u32>>,
|
||||
/// Inverted index from address tokens to property rows.
|
||||
address_token_index: FxHashMap<String, Vec<u32>>,
|
||||
/// Prefix lookup from typed address-token prefix to indexed full address tokens.
|
||||
address_prefix_index: FxHashMap<String, Vec<String>>,
|
||||
/// Interned normalized address-search tokens used for per-row scoring.
|
||||
address_search_interner: lasso::RodeoReader,
|
||||
/// Flat per-row normalized address-search token keys.
|
||||
address_search_token_keys: Vec<lasso::Spur>,
|
||||
/// Offset into `address_search_token_keys` for each row.
|
||||
address_search_token_offsets: Vec<u32>,
|
||||
/// Number of normalized address-search token keys for each row.
|
||||
address_search_token_lengths: Vec<u16>,
|
||||
/// For enum features: maps feature index to list of possible string values.
|
||||
/// Index in values list corresponds to the u16 value stored in feature_data.
|
||||
pub enum_values: rustc_hash::FxHashMap<usize, Vec<String>>,
|
||||
|
|
@ -197,6 +570,164 @@ impl PropertyData {
|
|||
(&self.postcode_interner, &self.postcode_keys)
|
||||
}
|
||||
|
||||
/// Returns the interned address-search token keys stored for `row`, read from
/// the flat key buffer via the row's offset and length.
///
/// Panics if `row` or the recorded span is out of bounds.
fn row_address_search_tokens(&self, row: usize) -> &[lasso::Spur] {
    let start = self.address_search_token_offsets[row] as usize;
    let count = usize::from(self.address_search_token_lengths[row]);
    &self.address_search_token_keys[start..][..count]
}
|
||||
|
||||
/// Search individual property addresses. Full postcode queries use a direct row index;
|
||||
/// free-text queries use a small inverted index over distinctive address tokens.
|
||||
pub fn search_addresses(&self, query: &str, limit: usize) -> Vec<usize> {
|
||||
if limit == 0 {
|
||||
return Vec::new();
|
||||
}
|
||||
|
||||
let parsed = parse_address_query(query);
|
||||
if parsed.full_postcode.is_none()
|
||||
&& parsed.text_groups.is_empty()
|
||||
&& parsed.numeric_terms.is_empty()
|
||||
{
|
||||
return Vec::new();
|
||||
}
|
||||
|
||||
let candidate_rows: Vec<u32> = if let Some(postcode) = parsed.full_postcode.as_deref() {
|
||||
self.postcode_interner
|
||||
.get(postcode)
|
||||
.and_then(|key| self.postcode_row_index.get(&key))
|
||||
.map(|rows| rows.to_vec())
|
||||
.unwrap_or_default()
|
||||
} else if let Some(rows) = self.best_address_token_rows(&parsed.candidate_terms) {
|
||||
rows.iter()
|
||||
.take(ADDRESS_SEARCH_CANDIDATE_LIMIT)
|
||||
.copied()
|
||||
.collect()
|
||||
} else {
|
||||
Vec::new()
|
||||
};
|
||||
|
||||
if candidate_rows.is_empty() {
|
||||
return Vec::new();
|
||||
}
|
||||
|
||||
let mut scored: Vec<(i32, usize, usize)> = candidate_rows
|
||||
.into_iter()
|
||||
.filter_map(|row| {
|
||||
let row = row as usize;
|
||||
self.address_match_score(row, &parsed)
|
||||
.map(|score| (score, self.address(row).len(), row))
|
||||
})
|
||||
.collect();
|
||||
|
||||
scored.sort_unstable_by(|left, right| {
|
||||
right
|
||||
.0
|
||||
.cmp(&left.0)
|
||||
.then(left.1.cmp(&right.1))
|
||||
.then(left.2.cmp(&right.2))
|
||||
});
|
||||
|
||||
let mut seen = FxHashSet::default();
|
||||
let mut results = Vec::with_capacity(limit);
|
||||
for (_, _, row) in scored {
|
||||
let address = self.address(row).trim();
|
||||
if address.is_empty() {
|
||||
continue;
|
||||
}
|
||||
let key = format!("{}\n{}", address.to_ascii_lowercase(), self.postcode(row));
|
||||
if !seen.insert(key) {
|
||||
continue;
|
||||
}
|
||||
results.push(row);
|
||||
if results.len() == limit {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
results
|
||||
}
|
||||
|
||||
fn best_address_token_rows(&self, terms: &[String]) -> Option<&[u32]> {
|
||||
let mut best: Option<&[u32]> = None;
|
||||
|
||||
for term in terms {
|
||||
if let Some(rows) = self.address_token_index.get(term) {
|
||||
if best.map_or(true, |current| rows.len() < current.len()) {
|
||||
best = Some(rows.as_slice());
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
if term.len() < 4 {
|
||||
continue;
|
||||
}
|
||||
|
||||
if let Some(tokens) = self.address_prefix_index.get(address_prefix_key(term)) {
|
||||
for token in tokens {
|
||||
if !token.starts_with(term) {
|
||||
continue;
|
||||
}
|
||||
if let Some(rows) = self.address_token_index.get(token) {
|
||||
if best.map_or(true, |current| rows.len() < current.len()) {
|
||||
best = Some(rows.as_slice());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
best
|
||||
}
|
||||
|
||||
fn address_match_score(&self, row: usize, parsed: &AddressQuery) -> Option<i32> {
|
||||
if self.address(row).trim().is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let tokens = self.row_address_search_tokens(row);
|
||||
if parsed
|
||||
.text_groups
|
||||
.iter()
|
||||
.any(|group| !self.address_tokens_match_group(tokens, group))
|
||||
{
|
||||
return None;
|
||||
}
|
||||
|
||||
let numeric_matches = parsed
|
||||
.numeric_terms
|
||||
.iter()
|
||||
.filter(|term| {
|
||||
tokens.iter().any(|token| {
|
||||
token_matches_numeric_term(self.address_search_interner.resolve(token), term)
|
||||
})
|
||||
})
|
||||
.count();
|
||||
|
||||
if !parsed.numeric_terms.is_empty() && numeric_matches == 0 {
|
||||
return None;
|
||||
}
|
||||
|
||||
let mut score = 0;
|
||||
if parsed.full_postcode.is_some() {
|
||||
score += 1_000;
|
||||
}
|
||||
score += (parsed.text_groups.len() as i32) * 200;
|
||||
score += (numeric_matches as i32) * 90;
|
||||
if numeric_matches == parsed.numeric_terms.len() && numeric_matches > 0 {
|
||||
score += 50;
|
||||
}
|
||||
|
||||
Some(score)
|
||||
}
|
||||
|
||||
fn address_tokens_match_group(&self, tokens: &[lasso::Spur], group: &AddressTermGroup) -> bool {
|
||||
group.alternatives.iter().any(|alternative| {
|
||||
tokens.iter().any(|token| {
|
||||
token_matches_query_term(self.address_search_interner.resolve(token), alternative)
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
/// Get the is_approx_build_date flag for a given row (bit-packed).
|
||||
pub fn is_approx_build_date(&self, row: usize) -> bool {
|
||||
let byte = self.approx_build_date_bits[row / 8];
|
||||
|
|
@ -946,27 +1477,70 @@ impl PropertyData {
|
|||
.map(|&perm_index| lon[perm_index as usize])
|
||||
.collect();
|
||||
|
||||
// Build contiguous address buffer (permuted)
|
||||
// Build contiguous address buffer and address search index (permuted)
|
||||
tracing::info!("Building interned strings");
|
||||
let total_addr_bytes: usize = address_raw.iter().map(|text| text.len()).sum();
|
||||
let mut address_buffer = String::with_capacity(total_addr_bytes);
|
||||
let mut address_offsets = Vec::with_capacity(row_count);
|
||||
let mut address_lengths = Vec::with_capacity(row_count);
|
||||
for &perm_index in &perm {
|
||||
let mut address_token_index: FxHashMap<String, Vec<u32>> = FxHashMap::default();
|
||||
let mut address_search_rodeo = lasso::Rodeo::default();
|
||||
let mut address_search_token_keys: Vec<lasso::Spur> = Vec::new();
|
||||
let mut address_search_token_offsets = Vec::with_capacity(row_count);
|
||||
let mut address_search_token_lengths = Vec::with_capacity(row_count);
|
||||
for (new_row, &perm_index) in perm.iter().enumerate() {
|
||||
let addr = &address_raw[perm_index as usize];
|
||||
let offset = address_buffer.len() as u32;
|
||||
let length = addr.len().min(u16::MAX as usize) as u16;
|
||||
address_offsets.push(offset);
|
||||
address_lengths.push(length);
|
||||
address_buffer.push_str(&addr[..length as usize]);
|
||||
|
||||
let search_tokens = address_search_tokens(addr);
|
||||
let token_offset = address_search_token_keys.len() as u32;
|
||||
let token_length = search_tokens.len().min(u16::MAX as usize) as u16;
|
||||
address_search_token_offsets.push(token_offset);
|
||||
address_search_token_lengths.push(token_length);
|
||||
|
||||
for token in search_tokens.iter().take(token_length as usize) {
|
||||
let key = address_search_rodeo.get_or_intern(token);
|
||||
address_search_token_keys.push(key);
|
||||
|
||||
if is_address_candidate_token(token) {
|
||||
address_token_index
|
||||
.entry(token.clone())
|
||||
.or_default()
|
||||
.push(new_row as u32);
|
||||
}
|
||||
}
|
||||
}
|
||||
let address_token_count_before_prune = address_token_index.len();
|
||||
address_token_index.retain(|_, rows| rows.len() <= ADDRESS_SEARCH_MAX_POSTINGS_PER_TOKEN);
|
||||
let address_prefix_index = build_address_prefix_index(&address_token_index);
|
||||
let address_search_interner = address_search_rodeo.into_reader();
|
||||
let address_postings_count: usize = address_token_index.values().map(Vec::len).sum();
|
||||
tracing::info!(
|
||||
tokens = address_token_index.len(),
|
||||
prefixes = address_prefix_index.len(),
|
||||
pruned_tokens =
|
||||
address_token_count_before_prune.saturating_sub(address_token_index.len()),
|
||||
postings = address_postings_count,
|
||||
row_tokens = address_search_token_keys.len(),
|
||||
"Address search index built"
|
||||
);
|
||||
|
||||
// Intern postcodes (permuted)
|
||||
let mut postcode_rodeo = lasso::Rodeo::default();
|
||||
let postcode_keys: Vec<lasso::Spur> = perm
|
||||
.iter()
|
||||
.map(|&perm_index| postcode_rodeo.get_or_intern(&postcode_raw[perm_index as usize]))
|
||||
.collect();
|
||||
let mut postcode_keys: Vec<lasso::Spur> = Vec::with_capacity(row_count);
|
||||
let mut postcode_row_index: FxHashMap<lasso::Spur, Vec<u32>> = FxHashMap::default();
|
||||
for (new_row, &perm_index) in perm.iter().enumerate() {
|
||||
let key = postcode_rodeo.get_or_intern(&postcode_raw[perm_index as usize]);
|
||||
postcode_keys.push(key);
|
||||
postcode_row_index
|
||||
.entry(key)
|
||||
.or_default()
|
||||
.push(new_row as u32);
|
||||
}
|
||||
let postcode_interner = postcode_rodeo.into_reader();
|
||||
|
||||
// Pack is_approx_build_date into a bitvec (8 bools per byte)
|
||||
|
|
@ -1110,6 +1684,13 @@ impl PropertyData {
|
|||
address_lengths,
|
||||
postcode_interner,
|
||||
postcode_keys,
|
||||
postcode_row_index,
|
||||
address_token_index,
|
||||
address_prefix_index,
|
||||
address_search_interner,
|
||||
address_search_token_keys,
|
||||
address_search_token_offsets,
|
||||
address_search_token_lengths,
|
||||
enum_values,
|
||||
enum_counts,
|
||||
approx_build_date_bits,
|
||||
|
|
@ -1133,6 +1714,120 @@ mod tests {
|
|||
Bounds::Percentile { low, high }
|
||||
}
|
||||
|
||||
/// Full postcodes in compact form are accepted; districts, words and house numbers are not.
#[test]
fn full_postcode_detection_accepts_common_formats() {
    let accepted = ["SW1A1AA", "E142DG", "M11AE"];
    let rejected = ["E14", "DOWNING", "10A"];

    for candidate in accepted.iter() {
        assert!(is_full_postcode_compact(candidate));
    }
    for candidate in rejected.iter() {
        assert!(!is_full_postcode_compact(candidate));
    }
}
|
||||
|
||||
/// The postcode is lifted out, numbers are collected, and stop words such as
/// "flat" and "st" leave only the distinctive street name behind.
#[test]
fn address_query_parsing_skips_postcodes_and_street_suffixes() {
    let AddressQuery {
        full_postcode,
        text_groups,
        numeric_terms,
        candidate_terms,
    } = parse_address_query("Flat 2, 10 Downing St, SW1A 2AA");

    let downing = vec!["downing".to_string()];

    assert_eq!(full_postcode.as_deref(), Some("SW1A 2AA"));
    assert_eq!(numeric_terms, vec!["10".to_string(), "2".to_string()]);
    assert_eq!(candidate_terms, downing);
    assert_eq!(text_groups.len(), 1);
    assert_eq!(text_groups[0].alternatives, downing);
}
|
||||
|
||||
/// A postcode typed without a space is still recognised and canonicalised.
#[test]
fn address_query_parsing_handles_compact_postcodes() {
    let query = parse_address_query("10 downing street sw1a1aa");

    let expected_numbers = vec!["10".to_string()];
    let expected_candidates = vec!["downing".to_string()];

    assert_eq!(query.full_postcode.as_deref(), Some("SW1A 1AA"));
    assert_eq!(query.numeric_terms, expected_numbers);
    assert_eq!(query.candidate_terms, expected_candidates);
}
|
||||
|
||||
/// A half-typed word ("cour") is kept as its own term so rows can be matched by prefix.
#[test]
fn address_query_parsing_keeps_partial_terms_for_row_matching() {
    let query = parse_address_query("settlers cour");

    assert_eq!(query.full_postcode, None);
    assert_eq!(query.numeric_terms, Vec::<String>::new());
    assert_eq!(
        query.candidate_terms,
        vec!["settlers".to_string(), "cour".to_string()]
    );

    let group_alternatives: Vec<&Vec<String>> = query
        .text_groups
        .iter()
        .map(|group| &group.alternatives)
        .collect();
    assert_eq!(group_alternatives.len(), 2);
    assert_eq!(group_alternatives[0], &vec!["settlers".to_string()]);
    assert_eq!(group_alternatives[1], &vec!["cour".to_string()]);
}
|
||||
|
||||
/// Row tokens keep numbers and stop words (sorted), since scoring matches against them.
#[test]
fn address_search_tokens_keep_actual_address_terms_for_scoring() {
    let expected: Vec<String> = ["10", "2", "cour", "downing", "flat"]
        .iter()
        .map(|token| token.to_string())
        .collect();

    assert_eq!(address_search_tokens("Flat 2, 10 Downing Cour"), expected);
}
|
||||
|
||||
/// Prefixes of four or more characters map to every indexed token sharing them;
/// shorter prefixes are not indexed at all.
#[test]
fn address_prefix_index_finds_partial_address_terms() {
    let mut token_index: FxHashMap<String, Vec<u32>> = FxHashMap::default();
    for (token, row) in [("downing", 1u32), ("downton", 2), ("market", 3)].iter() {
        token_index.insert(token.to_string(), vec![*row]);
    }

    let prefix_index = build_address_prefix_index(&token_index);
    let lookup = |prefix: &str| prefix_index.get(prefix).cloned().unwrap_or_default();

    assert_eq!(
        lookup("down"),
        vec!["downing".to_string(), "downton".to_string()]
    );
    assert_eq!(lookup("downi"), vec!["downing".to_string()]);
    assert_eq!(lookup("downt"), vec!["downton".to_string()]);
    assert!(!prefix_index.contains_key("do"));
}
|
||||
|
||||
/// A typed prefix matches the longer row token, and an alias group matches via
/// whichever spelling the row actually uses.
#[test]
fn address_term_matching_allows_prefixes_and_aliases() {
    let row_tokens = tokenize_address_text("10 Downing Street");

    let by_prefix = address_term_group("down").expect("prefix term should be searchable");
    assert!(address_tokens_match_group(&row_tokens, &by_prefix));

    let by_alias = AddressTermGroup {
        alternatives: vec!["st".to_string(), "street".to_string()],
    };
    assert!(address_tokens_match_group(&row_tokens, &by_alias));
}
|
||||
|
||||
/// A three-letter fragment of a stop word still matches the row token it prefixes.
#[test]
fn address_term_matching_uses_actual_token_prefixes() {
    let row_tokens = tokenize_address_text("12 Settlers Court");
    let partial = address_term_group("cou").expect("partial term should be searchable");

    assert!(address_tokens_match_group(&row_tokens, &partial));
}
|
||||
|
||||
#[test]
|
||||
fn histogram_empty_data() {
|
||||
let data: Vec<f32> = vec![];
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue