This commit is contained in:
Andras Schmelczer 2026-06-02 13:46:18 +01:00
parent a04ac2d857
commit d43da9708c
47 changed files with 4120 additions and 573 deletions

View file

@ -120,7 +120,7 @@ impl CrimeByYearData {
.list()
.with_context(|| format!("Column '{col_name}' is not a list"))?;
for row in 0..row_count {
for (row, postcode) in postcode_values.iter().enumerate().take(row_count) {
let Some(inner) = list_ca.get_as_series(row) else {
continue;
};
@ -163,7 +163,7 @@ impl CrimeByYearData {
points.sort_by_key(|p| p.year);
series_by_postcode
.entry(postcode_values[row].clone())
.entry(postcode.clone())
.or_default()
.push(PostcodeCrimeSeries {
type_idx: type_idx as u16,

View file

@ -4,10 +4,16 @@ use anyhow::Context;
use polars::frame::DataFrame;
use polars::lazy::frame::LazyFrame;
use polars::prelude::*;
use rustc_hash::FxHashMap;
use tracing::info;
use crate::utils::InternedColumn;
/// Upper bound on place rows scored per query (candidate sets are normally far smaller).
const PLACE_CANDIDATE_LIMIT: usize = 50_000;
/// Shortest prefix, in bytes, stored in the token prefix index; shorter query words are
/// skipped when seeding candidates from prefixes.
const PLACE_PREFIX_MIN_LEN: usize = 2;
/// Longest prefix, in bytes, stored in the token prefix index; longer query words are looked
/// up by their first `PLACE_PREFIX_MAX_LEN` bytes and then verified with `starts_with`.
const PLACE_PREFIX_MAX_LEN: usize = 6;
pub struct PlaceData {
pub name: Vec<String>,
pub name_lower: Vec<String>,
@ -19,6 +25,13 @@ pub struct PlaceData {
pub lon: Vec<f32>,
pub city: Vec<Option<String>>,
pub travel_destination: Vec<bool>,
/// Inverted index from an alias token to the (ascending) place rows containing it. Lets place
/// search gather candidates instead of scanning all ~1M+ rows per keystroke.
token_index: FxHashMap<String, Vec<u32>>,
/// Prefix → indexed tokens, for matching a partially-typed final word.
token_prefix_index: FxHashMap<String, Vec<String>>,
/// Trigram → fuzzy-eligible rows (settlements/stations only), for bounded typo matching.
fuzzy_trigram_index: FxHashMap<u32, Vec<u32>>,
}
#[derive(Clone, Copy)]
@ -168,6 +181,148 @@ pub fn normalize_search_text(text: &str) -> String {
result
}
/// Iterates over every non-empty token of `search_text`, treating both the space between
/// words and the `|` between aliases as separators. Tokens come out in order of appearance
/// and repeats are kept. Used for token-AND matching, where every query word must
/// prefix-match some place token.
pub fn place_alias_tokens(search_text: &str) -> impl Iterator<Item = &str> {
    search_text
        .split(|ch: char| matches!(ch, ' ' | '|'))
        .filter(|piece| !piece.is_empty())
}
/// Hashes three characters into a `u32` with an FNV-1a-style fold over their code points:
/// start from the 32-bit offset basis, then XOR in each character and multiply by the
/// 32-bit prime (wrapping on overflow). Character order matters.
fn trigram_hash(first: char, second: char, third: char) -> u32 {
    const OFFSET_BASIS: u32 = 2_166_136_261;
    const PRIME: u32 = 16_777_619;
    [first, second, third]
        .iter()
        .fold(OFFSET_BASIS, |acc, &ch| (acc ^ (ch as u32)).wrapping_mul(PRIME))
}
/// Hashed character trigrams of `text`, returned sorted ascending with duplicates removed so
/// two results can be compared by a linear merge (see `trigram_similarity`).
///
/// The text is first passed through `normalize_search_text`; an empty normalized string
/// yields an empty vector. Otherwise the characters are padded with two leading spaces and
/// one trailing space before sliding a three-character window, so word starts and ends
/// contribute their own trigrams.
pub fn compute_trigrams(text: &str) -> Vec<u32> {
    let normalized = normalize_search_text(text);
    if normalized.is_empty() {
        return Vec::new();
    }

    let mut padded: Vec<char> = vec![' ', ' '];
    padded.extend(normalized.chars());
    padded.push(' ');

    let mut hashes = Vec::with_capacity(padded.len().saturating_sub(2));
    for window in padded.windows(3) {
        hashes.push(trigram_hash(window[0], window[1], window[2]));
    }
    hashes.sort_unstable();
    hashes.dedup();
    hashes
}
/// Values present in both ascending-sorted row-id slices, in ascending order.
///
/// Walks both slices in lock-step, advancing whichever side holds the smaller value, so the
/// cost is linear in the combined length. Both inputs must already be sorted ascending.
fn intersect_sorted(left: &[u32], right: &[u32]) -> Vec<u32> {
    use std::cmp::Ordering;

    let mut common = Vec::new();
    let (mut li, mut ri) = (0usize, 0usize);
    while let (Some(&a), Some(&b)) = (left.get(li), right.get(ri)) {
        match a.cmp(&b) {
            Ordering::Less => li += 1,
            Ordering::Greater => ri += 1,
            Ordering::Equal => {
                common.push(a);
                li += 1;
                ri += 1;
            }
        }
    }
    common
}
/// Merges two ascending-sorted row-id slices into one ascending vector. A value that occurs
/// in both inputs is emitted once; whatever remains of either slice after the other runs out
/// is appended unchanged.
fn union_sorted(left: &[u32], right: &[u32]) -> Vec<u32> {
    use std::cmp::Ordering;

    let mut merged = Vec::with_capacity(left.len() + right.len());
    let (mut li, mut ri) = (0usize, 0usize);
    while let (Some(&a), Some(&b)) = (left.get(li), right.get(ri)) {
        let ordering = a.cmp(&b);
        // The smaller head is emitted; on a tie both heads are consumed but only one is kept.
        merged.push(a.min(b));
        if ordering != Ordering::Greater {
            li += 1;
        }
        if ordering != Ordering::Less {
            ri += 1;
        }
    }
    merged.extend_from_slice(&left[li..]);
    merged.extend_from_slice(&right[ri..]);
    merged
}
/// Distinct tokens worth indexing for one place: every alias token of `search_text` that is
/// at least two bytes long, returned sorted and without repeats.
///
/// NOTE(review): callers slice these tokens by byte offset, which presumes they are ASCII
/// after `normalize_search_text` — confirm against that function.
fn place_index_tokens(search_text: &str) -> Vec<String> {
    // An ordered set gives the sort and the de-duplication in one step.
    let unique: std::collections::BTreeSet<&str> = place_alias_tokens(search_text)
        .filter(|token| token.len() >= 2)
        .collect();
    unique.into_iter().map(str::to_owned).collect()
}
/// Builds the prefix → tokens lookup used to expand a partially-typed word.
///
/// Every key of `token_index` is registered under each of its leading prefixes whose byte
/// length lies in `PLACE_PREFIX_MIN_LEN..=PLACE_PREFIX_MAX_LEN` (capped at the token's own
/// length). Each prefix's token list is then sorted and de-duplicated, so the result does
/// not depend on hash-map iteration order.
///
/// Prefix lengths that would cut a multi-byte character in half are skipped instead of
/// sliced: byte-indexing a `str` off a char boundary panics, and nothing in this function
/// guarantees the tokens are ASCII.
fn build_place_prefix_index(
    token_index: &FxHashMap<String, Vec<u32>>,
) -> FxHashMap<String, Vec<String>> {
    let mut prefix_index: FxHashMap<String, Vec<String>> = FxHashMap::default();
    for token in token_index.keys() {
        let max_len = token.len().min(PLACE_PREFIX_MAX_LEN);
        for len in PLACE_PREFIX_MIN_LEN..=max_len {
            // `get` returns `None` when `len` is not on a char boundary.
            let Some(prefix) = token.get(..len) else {
                continue;
            };
            prefix_index
                .entry(prefix.to_string())
                .or_default()
                .push(token.clone());
        }
    }
    for tokens in prefix_index.values_mut() {
        tokens.sort_unstable();
        tokens.dedup();
    }
    prefix_index
}
/// Whether rows of `place_type` take part in fuzzy (typo) matching.
///
/// Streets, parks, attractions, hospitals and retail are excluded, which keeps the fuzzy
/// index bounded; every other type string — including ones not listed here — is eligible.
/// The comparison is exact and case-sensitive.
fn is_fuzzy_eligible_type(place_type: &str) -> bool {
    match place_type {
        "street" | "park" | "attraction" | "hospital" | "retail" => false,
        _ => true,
    }
}
/// Jaccard similarity between two sorted trigram sets, in the range 0.0–1.0.
///
/// Both slices are expected to be sorted ascending without repeats (as produced by
/// `compute_trigrams`). The shared elements are counted with a linear merge and divided by
/// the size of the union. If either side is empty the similarity is 0.0.
pub fn trigram_similarity(left: &[u32], right: &[u32]) -> f32 {
    use std::cmp::Ordering;

    if left.is_empty() || right.is_empty() {
        return 0.0;
    }

    let mut shared = 0usize;
    let (mut li, mut ri) = (0usize, 0usize);
    while let (Some(a), Some(b)) = (left.get(li), right.get(ri)) {
        match a.cmp(b) {
            Ordering::Less => li += 1,
            Ordering::Greater => ri += 1,
            Ordering::Equal => {
                shared += 1;
                li += 1;
                ri += 1;
            }
        }
    }

    // |A ∪ B| = |A| + |B| − |A ∩ B|
    let combined = left.len() + right.len() - shared;
    shared as f32 / combined as f32
}
fn replace_token(text: &str, from: &str, to: &str) -> Option<String> {
let mut changed = false;
let replaced: Vec<&str> = text
@ -191,15 +346,31 @@ fn push_alias(aliases: &mut Vec<String>, alias: String) {
}
}
/// Bidirectional token abbreviations expanded into search aliases so a query typed either
/// way matches (e.g. "gt missenden" ↔ "Great Missenden", "mt" ↔ "Mount").
///
/// Each entry is a `(from, to)` pair. Both directions of every abbreviation are listed
/// explicitly, so a lookup only ever needs to go from the first element to the second.
const PLACE_TOKEN_ALIASES: &[(&str, &str)] = &[
("st", "saint"),
("saint", "st"),
("mt", "mount"),
("mount", "mt"),
("gt", "great"),
("great", "gt"),
("lt", "little"),
("little", "lt"),
("upr", "upper"),
("upper", "upr"),
("lwr", "lower"),
("lower", "lwr"),
];
fn build_search_text(name: &str, place_type: &str) -> String {
let primary = normalize_search_text(name);
let mut aliases = vec![primary.clone()];
if let Some(alias) = replace_token(&primary, "st", "saint") {
push_alias(&mut aliases, alias);
}
if let Some(alias) = replace_token(&primary, "saint", "st") {
push_alias(&mut aliases, alias);
for (from, to) in PLACE_TOKEN_ALIASES {
if let Some(alias) = replace_token(&primary, from, to) {
push_alias(&mut aliases, alias);
}
}
if place_type == "station" {
@ -391,6 +562,26 @@ impl PlaceData {
fallback_city
};
// Build the place search index: an inverted token index over all rows (so the per-query
// cost scales with matched candidates, not the ~1M-row corpus), plus a trigram index over
// only fuzzy-eligible rows for bounded typo matching.
let mut token_index: FxHashMap<String, Vec<u32>> = FxHashMap::default();
let mut fuzzy_trigram_index: FxHashMap<u32, Vec<u32>> = FxHashMap::default();
for idx in 0..row_count {
for token in place_index_tokens(&name_search[idx]) {
token_index.entry(token).or_default().push(idx as u32);
}
if is_fuzzy_eligible_type(&place_type_raw[idx]) {
for trigram in compute_trigrams(&name[idx]) {
fuzzy_trigram_index
.entry(trigram)
.or_default()
.push(idx as u32);
}
}
}
let token_prefix_index = build_place_prefix_index(&token_index);
let with_pop = population.iter().filter(|&&pop| pop > 0).count();
let with_city = city.iter().filter(|c| c.is_some()).count();
info!(
@ -398,6 +589,8 @@ impl PlaceData {
types = place_type.values.len(),
with_population = with_pop,
with_city = with_city,
tokens = token_index.len(),
fuzzy_trigrams = fuzzy_trigram_index.len(),
"Place data loaded"
);
@ -412,14 +605,261 @@ impl PlaceData {
lon,
city,
travel_destination,
token_index,
token_prefix_index,
fuzzy_trigram_index,
})
}
/// Candidate place rows for the query's content tokens, capped at `PLACE_CANDIDATE_LIMIT`.
///
/// Tokens that match an indexed token exactly are resolved to their posting lists, which
/// are intersected smallest-first (stopping early once nothing is left). Tokens without an
/// exact match are ignored in that case. Only when *no* token matches exactly does the
/// lookup fall back to `place_prefix_seed`, so a partially-typed word still finds rows.
pub fn place_candidate_rows(&self, tokens: &[&str]) -> Vec<u32> {
    let mut postings: Vec<&[u32]> = Vec::with_capacity(tokens.len());
    for token in tokens {
        if let Some(rows) = self.token_index.get(*token) {
            postings.push(rows.as_slice());
        }
    }
    // Starting from the shortest list keeps every intermediate intersection small.
    postings.sort_by_key(|rows| rows.len());

    let mut candidates = match postings.split_first() {
        None => self.place_prefix_seed(tokens),
        Some((smallest, rest)) => {
            let mut remaining = smallest.to_vec();
            for rows in rest {
                if remaining.is_empty() {
                    break;
                }
                remaining = intersect_sorted(&remaining, rows);
            }
            remaining
        }
    };
    candidates.truncate(PLACE_CANDIDATE_LIMIT);
    candidates
}
/// Seeds candidates from a partially-typed word when no query token matched exactly.
///
/// For each token of at least `PLACE_PREFIX_MIN_LEN` bytes, the indexed tokens sharing its
/// leading bytes (at most `PLACE_PREFIX_MAX_LEN`) are fetched from the prefix index, those
/// that really start with the whole token are kept, and their posting lists are unioned.
/// The smallest non-empty union across all query tokens is returned; the result is empty
/// when no token expands to anything.
///
/// The lookup key is cut on a char boundary: slicing a `str` at an arbitrary byte offset
/// panics if it lands inside a multi-byte character, and the tokens here come from query
/// text.
fn place_prefix_seed(&self, tokens: &[&str]) -> Vec<u32> {
    let mut best: Option<Vec<u32>> = None;
    for &token in tokens {
        if token.len() < PLACE_PREFIX_MIN_LEN {
            continue;
        }
        // Back off to the nearest char boundary at or below the maximum key length.
        // Offset 0 is always a boundary, so the loop terminates.
        let mut key_len = token.len().min(PLACE_PREFIX_MAX_LEN);
        while !token.is_char_boundary(key_len) {
            key_len -= 1;
        }
        if key_len < PLACE_PREFIX_MIN_LEN {
            continue;
        }
        let Some(indexed) = self.token_prefix_index.get(&token[..key_len]) else {
            continue;
        };
        let mut union: Vec<u32> = Vec::new();
        for indexed_token in indexed {
            // The key may be shorter than the token, so re-check the full prefix.
            if !indexed_token.starts_with(token) {
                continue;
            }
            if let Some(rows) = self.token_index.get(indexed_token) {
                union = if union.is_empty() {
                    rows.clone()
                } else {
                    union_sorted(&union, rows)
                };
            }
        }
        if !union.is_empty()
            && best
                .as_ref()
                .is_none_or(|current| union.len() < current.len())
        {
            best = Some(union);
        }
    }
    best.unwrap_or_default()
}
/// Rows from the fuzzy trigram index that share enough trigrams with the query to be worth
/// Jaccard scoring.
///
/// Every row listed under any query trigram gets one vote per trigram it is listed under; a
/// row is returned when its votes reach 40% of the query's trigram count (rounded up, and
/// at least one). An empty query yields no rows. The returned order follows the iteration
/// order of the internal count map, so callers should not rely on it being sorted.
pub fn fuzzy_candidate_rows(&self, query_trigrams: &[u32]) -> Vec<u32> {
    if query_trigrams.is_empty() {
        return Vec::new();
    }

    let mut shared_by_row: FxHashMap<u32, u16> = FxHashMap::default();
    let voting_rows = query_trigrams
        .iter()
        .filter_map(|trigram| self.fuzzy_trigram_index.get(trigram))
        .flatten();
    for &row in voting_rows {
        *shared_by_row.entry(row).or_default() += 1;
    }

    let threshold = (((query_trigrams.len() as f32) * 0.4).ceil() as u16).max(1);
    let mut rows = Vec::new();
    for (row, shared) in shared_by_row {
        if shared >= threshold {
            rows.push(row);
        }
    }
    rows
}
}
#[cfg(test)]
impl PlaceData {
    /// Test fixture: builds a `PlaceData` from `(name, place_type)` pairs with the search
    /// indexes populated the same way the loader populates them. Population, coordinates,
    /// city and travel-destination columns are filled with zero / `None` / `false`.
    fn from_names<S: AsRef<str>>(rows: &[(S, S)]) -> Self {
        let (name, place_type_raw): (Vec<String>, Vec<String>) = rows
            .iter()
            .map(|(nm, pt)| (nm.as_ref().to_string(), pt.as_ref().to_string()))
            .unzip();
        let name_lower: Vec<String> = name.iter().map(|nm| nm.to_lowercase()).collect();
        let mut name_search: Vec<String> = Vec::with_capacity(name.len());
        for (nm, pt) in name.iter().zip(&place_type_raw) {
            name_search.push(build_search_text(nm, pt));
        }

        let mut token_index: FxHashMap<String, Vec<u32>> = FxHashMap::default();
        let mut fuzzy_trigram_index: FxHashMap<u32, Vec<u32>> = FxHashMap::default();
        for (idx, search_text) in name_search.iter().enumerate() {
            let row = idx as u32;
            for token in place_index_tokens(search_text) {
                token_index.entry(token).or_default().push(row);
            }
            if !is_fuzzy_eligible_type(&place_type_raw[idx]) {
                continue;
            }
            for trigram in compute_trigrams(&name[idx]) {
                fuzzy_trigram_index.entry(trigram).or_default().push(row);
            }
        }
        let token_prefix_index = build_place_prefix_index(&token_index);

        let len = name.len();
        PlaceData {
            name,
            name_lower,
            name_search,
            place_type: InternedColumn::build(&place_type_raw),
            type_rank: place_type_raw.iter().map(|pt| type_rank(pt)).collect(),
            population: vec![0; len],
            lat: vec![0.0; len],
            lon: vec![0.0; len],
            city: vec![None; len],
            travel_destination: vec![false; len],
            token_index,
            token_prefix_index,
            fuzzy_trigram_index,
        }
    }
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn place_index_tokens_dedup_and_min_length() {
    // The leading "a" is too short to index, "albans" appears in both aliases but must be
    // reported once, and the aliases themselves are split on " | ".
    assert_eq!(
        place_index_tokens("a st albans | saint albans"),
        vec!["albans".to_string(), "saint".to_string(), "st".to_string()]
    );
}
#[test]
fn place_candidate_rows_intersect_and_prefix_seed() {
    // Row ids follow the order of this list: 0..=4.
    let places = PlaceData::from_names(&[
        ("Camden", "suburb"),
        ("Camden Town", "suburb"),
        ("Camden Market", "attraction"),
        ("Manchester", "city"),
        ("Manchester Piccadilly", "station"),
    ]);

    // A word typed in full resolves to its posting list: all three Camden rows.
    assert_eq!(places.place_candidate_rows(&["camden"]), [0, 1, 2]);

    // Two full words keep only the rows containing both, i.e. Camden Town.
    assert_eq!(places.place_candidate_rows(&["camden", "town"]), [1]);

    // A partially-typed word with no exact token is seeded from the prefix index.
    assert_eq!(places.place_candidate_rows(&["piccad"]), [4]);

    // Nothing indexed starts with this, so there are no candidates.
    assert_eq!(places.place_candidate_rows(&["zzzz"]), Vec::<u32>::new());
}
// Manual benchmark for the indexed place search; ignored in normal test runs because it
// builds a one-million-row fixture and asserts on wall-clock time.
// Run with: cargo test --release bench_place_search -- --ignored --nocapture
#[test]
#[ignore]
fn bench_place_search_at_one_million_rows() {
let roads = [
"High Street",
"Station Road",
"Church Lane",
"Victoria Road",
"Mill Lane",
"Park Avenue",
"Queens Road",
"Kings Road",
];
let mut rows: Vec<(String, String)> = Vec::with_capacity(1_000_000);
for i in 0..1_000_000usize {
// Vary the name with a numeric suffix so the rows are not all identical. Because 8
// divides 4000, the road is determined by the suffix and only 4000 distinct names occur.
rows.push((
format!("{} {}", roads[i % roads.len()], i % 4000),
"street".into(),
));
}
// One fuzzy-eligible row on top of the synthetic streets.
rows.push(("London".into(), "city".into()));
let pd = PlaceData::from_names(&rows);
// Only the query loop is timed; building the fixture above is excluded.
let start = std::time::Instant::now();
let mut hits = 0usize;
for _ in 0..50 {
let candidates = pd.place_candidate_rows(&["high", "street"]);
for row in candidates {
let idx = row as usize;
if place_search_test_score(&pd, idx, "high street", &["high", "street"]).is_some() {
hits += 1;
}
}
}
// Average over the 50 repetitions.
let per_query = start.elapsed() / 50;
println!(
"indexed place search over {} rows: {:?}/query ({} hits)",
pd.name.len(),
per_query,
hits / 50
);
// The old full O(N) scan measured ~36ms here; candidate-based must be far under that.
assert!(per_query.as_millis() < 10, "per_query was {per_query:?}");
}
/// Bench-only stand-in for the route's per-candidate match check.
///
/// Returns `Some(640.0)` when every query token equals, or (for tokens of two or more
/// bytes) is a prefix of, some alias token of row `idx`. Failing that, returns
/// `Some(1000.0)` when the row's lowercase name equals `query_search`, and `None` otherwise.
fn place_search_test_score(
    pd: &PlaceData,
    idx: usize,
    query_search: &str,
    query_tokens: &[&str],
) -> Option<f32> {
    let search_text = &pd.name_search[idx];
    let matches_some_alias_token = |query_token: &str| {
        place_alias_tokens(search_text).any(|candidate| {
            candidate == query_token
                || (query_token.len() >= 2 && candidate.starts_with(query_token))
        })
    };

    if query_tokens.iter().all(|&qt| matches_some_alias_token(qt)) {
        return Some(640.0);
    }
    (pd.name_lower[idx] == query_search).then_some(1000.0)
}
#[test]
fn fuzzy_candidate_rows_finds_typos_only_for_eligible_rows() {
    const LONDON_ROW: u32 = 0;
    const BAKER_STREET_ROW: u32 = 1;

    let places = PlaceData::from_names(&[
        ("London", "city"),
        ("Baker Street", "street"), // streets never enter the fuzzy index
    ]);

    let fuzzy_rows = places.fuzzy_candidate_rows(&compute_trigrams("Londn"));

    // The misspelt city is still reachable; the street is not offered at all.
    assert!(fuzzy_rows.contains(&LONDON_ROW));
    assert!(!fuzzy_rows.contains(&BAKER_STREET_ROW));
}
fn test_city_rows() -> [(&'static str, f32, f32, u32); 5] {
[
("London", 51.507_446, -0.1277653, 8_908_083),
@ -470,6 +910,29 @@ mod tests {
assert!(build_search_text("Shadwell DLR station", "station").contains("shadwell station"));
}
#[test]
fn search_text_expands_directional_and_size_abbreviations() {
    // (place name, place type, abbreviated alias expected inside the search text)
    let great = build_search_text("Great Missenden", "village");
    assert!(great.contains("gt missenden"));

    let mount = build_search_text("Mount Pleasant", "suburb");
    assert!(mount.contains("mt pleasant"));

    let little = build_search_text("Little Venice", "suburb");
    assert!(little.contains("lt venice"));
}
#[test]
fn trigram_similarity_is_high_for_typos_and_low_for_unrelated() {
    let reference = compute_trigrams("London");

    // A one-letter typo stays at or above the 0.4 mark.
    let misspelt = trigram_similarity(&reference, &compute_trigrams("Londn"));
    assert!(misspelt >= 0.4);

    // An unrelated name scores well below it.
    let unrelated = trigram_similarity(&reference, &compute_trigrams("Manchester"));
    assert!(unrelated < 0.2);

    // A set compared with itself is a perfect match (within float tolerance).
    let identical = trigram_similarity(&reference, &reference);
    assert!((identical - 1.0).abs() < 1e-6);
}
#[test]
fn place_alias_tokens_split_across_aliases() {
    // Both the word spaces and the alias separator split; repeats are preserved.
    assert_eq!(
        place_alias_tokens("kings cross | kings x").collect::<Vec<&str>>(),
        ["kings", "cross", "kings", "x"]
    );
}
#[test]
fn travel_destination_types_match_legacy_places() {
assert!(is_travel_destination_type("city"));

View file

@ -398,7 +398,7 @@ fn build_school_meta(
let mut idx = vec![u32::MAX; row_count];
let mut meta = Vec::new();
for row in 0..row_count {
for (row, meta_idx) in idx.iter_mut().enumerate().take(row_count) {
let type_group_val = fetch_str(&type_group, row);
let type_val = fetch_str(&r#type, row);
// type_group is present for every GIAS row, so use it as the sentinel
@ -406,7 +406,7 @@ fn build_school_meta(
if type_group_val.is_none() && type_val.is_none() {
continue;
}
idx[row] = meta.len() as u32;
*meta_idx = meta.len() as u32;
meta.push(SchoolMetadata {
phase: fetch_str(&phase, row),
r#type: type_val,

View file

@ -10,8 +10,10 @@ use rustc_hash::{FxHashMap, FxHashSet};
use crate::consts::{H3_PRECOMPUTE_MAX, HISTOGRAM_BINS, NAN_U16, QUANT_SCALE};
use crate::features::{self, Bounds};
const ADDRESS_SEARCH_CANDIDATE_LIMIT: usize = 50_000;
const ADDRESS_SEARCH_MAX_POSTINGS_PER_TOKEN: usize = 250_000;
/// Upper bound on rows scored per query. Intersection keeps most candidate sets far below
/// this; only a single very common road word (e.g. "high") approaches it, and the in-area
/// priority sort keeps a refined query's matches ahead of the cut.
const ADDRESS_SEARCH_CANDIDATE_LIMIT: usize = 150_000;
const ADDRESS_SEARCH_PREFIX_MIN_LEN: usize = 4;
const ADDRESS_SEARCH_PREFIX_MAX_LEN: usize = 8;
const NO_POI_METRIC_ROW: u32 = u32::MAX;
@ -162,6 +164,11 @@ struct AddressTermGroup {
#[derive(Debug)]
struct AddressQuery {
full_postcode: Option<String>,
/// Compact uppercase outward code (optionally with a sector digit) recovered when the
/// user appended a partial postcode like "NW1" or "NW1 6". Used as an additive ranking
/// bias, never as a hard filter — so the disambiguating hint is honoured without
/// excluding the same road in other areas.
postcode_area: Option<String>,
text_groups: Vec<AddressTermGroup>,
numeric_terms: Vec<String>,
candidate_terms: Vec<String>,
@ -442,6 +449,138 @@ fn build_address_prefix_index(
prefix_index
}
/// Values present in both ascending-sorted row-id slices, in ascending order.
///
/// Walks both slices in lock-step, advancing whichever side holds the smaller value, so the
/// cost is linear in the combined length. Both inputs must already be sorted ascending.
fn intersect_sorted(left: &[u32], right: &[u32]) -> Vec<u32> {
    use std::cmp::Ordering;

    let mut common = Vec::new();
    let (mut li, mut ri) = (0usize, 0usize);
    while let (Some(&a), Some(&b)) = (left.get(li), right.get(ri)) {
        match a.cmp(&b) {
            Ordering::Less => li += 1,
            Ordering::Greater => ri += 1,
            Ordering::Equal => {
                common.push(a);
                li += 1;
                ri += 1;
            }
        }
    }
    common
}
/// Merges two ascending-sorted row-id slices into one ascending vector. A value that occurs
/// in both inputs is emitted once; whatever remains of either slice after the other runs out
/// is appended unchanged.
fn union_sorted(left: &[u32], right: &[u32]) -> Vec<u32> {
    use std::cmp::Ordering;

    let mut merged = Vec::with_capacity(left.len() + right.len());
    let (mut li, mut ri) = (0usize, 0usize);
    while let (Some(&a), Some(&b)) = (left.get(li), right.get(ri)) {
        let ordering = a.cmp(&b);
        // The smaller head is emitted; on a tie both heads are consumed but only one is kept.
        merged.push(a.min(b));
        if ordering != Ordering::Greater {
            li += 1;
        }
        if ordering != Ordering::Less {
            ri += 1;
        }
    }
    merged.extend_from_slice(&left[li..]);
    merged.extend_from_slice(&right[ri..]);
    merged
}
/// An ordinal like "1st", "2nd", "3rd", "21st" — part of the street name ("2nd Avenue"), not a
/// house-number prefix.
///
/// Returns `true` when `token` is one or more ASCII digits followed by exactly one of the
/// suffixes `st`, `nd`, `rd` or `th`. The suffix is peeled off with `strip_suffix` rather
/// than by splitting at `len - 2`, so a token containing multi-byte characters can never
/// trigger a char-boundary panic; such tokens simply are not ordinals.
fn is_ordinal_token(token: &str) -> bool {
    ["st", "nd", "rd", "th"]
        .iter()
        .find_map(|suffix| token.strip_suffix(*suffix))
        .is_some_and(|digits| {
            !digits.is_empty() && digits.bytes().all(|byte| byte.is_ascii_digit())
        })
}
/// Whether a leading address token names a unit or house number rather than the street.
///
/// Ordinals ("2nd") are never prefixes. Otherwise a token counts when it is a unit word
/// (flat, apt, unit, ...), is a single byte long, consists solely of ASCII digits, or starts
/// with an ASCII digit and also contains an ASCII letter ("221b").
fn is_house_prefix_token(token: &str) -> bool {
    const UNIT_WORDS: [&str; 9] = [
        "flat",
        "fl",
        "apartment",
        "apt",
        "unit",
        "no",
        "block",
        "floor",
        "room",
    ];

    if is_ordinal_token(token) {
        return false;
    }
    if UNIT_WORDS.contains(&token) || token.len() == 1 {
        return true;
    }

    let digits_only = token.chars().all(|ch| ch.is_ascii_digit());
    let leads_with_digit = token.starts_with(|ch: char| ch.is_ascii_digit());
    let has_letter = token.contains(|ch: char| ch.is_ascii_alphabetic());
    digits_only || (leads_with_digit && has_letter)
}
/// Street-level key for an address: the address tokens joined by single spaces, with any
/// leading house-number / flat tokens removed, so "12 Baker Street" and "5 Baker Street"
/// share one key. If every token looks like a house prefix, nothing is removed and all
/// tokens are kept.
fn street_key(address: &str) -> String {
    let tokens = tokenize_address_text(address);
    let first_street_token = tokens
        .iter()
        .position(|token| !is_house_prefix_token(token));
    match first_street_token {
        Some(start) => tokens[start..].join(" "),
        None => tokens.join(" "),
    }
}
/// Road-type words. Their presence (with no house number) marks a road browse, which we
/// collapse to one result per street.
///
/// Entries are lowercase and include both full words and common abbreviations
/// ("street"/"st", "road"/"rd", ...). Lookups compare whole tokens for equality, so an
/// entry only matches a query word that is exactly that string.
const ROAD_TYPE_TOKENS: &[&str] = &[
"street",
"st",
"road",
"rd",
"lane",
"ln",
"avenue",
"ave",
"close",
"cl",
"drive",
"dr",
"way",
"court",
"ct",
"crescent",
"cres",
"place",
"terrace",
"terr",
"grove",
"gardens",
"gdns",
"walk",
"row",
"square",
"sq",
"hill",
"parade",
"mews",
"embankment",
"broadway",
"boulevard",
"blvd",
];
/// Whether any token of `query` (as produced by `tokenize_address_text`) is exactly one of
/// the `ROAD_TYPE_TOKENS` entries.
fn query_has_road_type(query: &str) -> bool {
    let tokens = tokenize_address_text(query);
    for token in tokens.iter() {
        if ROAD_TYPE_TOKENS.contains(&token.as_str()) {
            return true;
        }
    }
    false
}
/// The outward code of a canonical postcode: everything before the first space. A postcode
/// without a space is returned whole.
fn outcode_of(postcode: &str) -> &str {
    postcode
        .split_once(' ')
        .map_or(postcode, |(outward, _inward)| outward)
}
fn parse_address_query(query: &str) -> AddressQuery {
let tokens = tokenize_address_text(query);
let (full_postcode, postcode_token_indices) = extract_full_postcode(&tokens)
@ -449,12 +588,45 @@ fn parse_address_query(query: &str) -> AddressQuery {
.unwrap_or((None, Vec::new()));
let skip_postcode_tokens: FxHashSet<usize> = postcode_token_indices.into_iter().collect();
// Recover an appended partial postcode (outcode, or outcode + sector digit) as a ranking
// bias rather than discarding it — but only from the TRAILING position, so a leading road
// designation like "A4 Great West Road" is not mistaken for an area refinement.
let mut postcode_area: Option<String> = None;
let mut consumed_partial_tokens: FxHashSet<usize> = FxHashSet::default();
if full_postcode.is_none() && !tokens.is_empty() {
let last = tokens.len() - 1;
if !skip_postcode_tokens.contains(&last) {
let sector_digit =
tokens[last].len() == 1 && tokens[last].chars().all(|ch| ch.is_ascii_digit());
if last >= 1
&& sector_digit
&& !skip_postcode_tokens.contains(&(last - 1))
&& looks_like_postcode_fragment(&tokens[last - 1])
{
postcode_area = Some(format!(
"{}{}",
tokens[last - 1].to_ascii_uppercase(),
tokens[last]
));
consumed_partial_tokens.insert(last);
consumed_partial_tokens.insert(last - 1);
} else if looks_like_postcode_fragment(&tokens[last]) {
postcode_area = Some(tokens[last].to_ascii_uppercase());
consumed_partial_tokens.insert(last);
}
}
}
let mut text_groups = Vec::new();
let mut numeric_terms = Vec::new();
let mut candidate_terms = Vec::new();
for (idx, token) in tokens.iter().enumerate() {
if skip_postcode_tokens.contains(&idx) || looks_like_postcode_fragment(token) {
if skip_postcode_tokens.contains(&idx)
|| consumed_partial_tokens.contains(&idx)
|| looks_like_postcode_fragment(token)
{
continue;
}
@ -486,6 +658,7 @@ fn parse_address_query(query: &str) -> AddressQuery {
AddressQuery {
full_postcode,
postcode_area,
text_groups,
numeric_terms,
candidate_terms,
@ -897,9 +1070,15 @@ impl PropertyData {
&self.address_search_token_keys[offset..offset + length]
}
/// Search individual property addresses. Full postcode queries use a direct row index;
/// free-text queries use a small inverted index over distinctive address tokens.
pub fn search_addresses(&self, query: &str, limit: usize) -> Vec<usize> {
/// Search individual property addresses, returning `(row, score)` ranked best-first.
///
/// Candidate rows come from intersecting the posting lists of the distinctive words the
/// user typed in full (so "Cherry Hinton Road" narrows to rows containing both), unioned
/// with the exact-postcode rows when a complete postcode is present (so a postcode is a
/// boost, not an all-or-nothing gate). An appended partial postcode keeps in-area rows
/// ahead of the candidate cut and adds a scoring bias. With a road-type word and no house
/// number, results collapse to one row per street.
pub fn search_addresses(&self, query: &str, limit: usize) -> Vec<(usize, i32)> {
if limit == 0 {
return Vec::new();
}
@ -912,25 +1091,45 @@ impl PropertyData {
return Vec::new();
}
let candidate_rows: Vec<u32> = if let Some(postcode) = parsed.full_postcode.as_deref() {
self.postcode_interner
let mut candidate_rows = self.address_candidate_rows(&parsed.candidate_terms);
// A complete postcode contributes its rows too, instead of replacing the road match.
if let Some(postcode) = parsed.full_postcode.as_deref() {
if let Some(rows) = self
.postcode_interner
.get(postcode)
.and_then(|key| self.postcode_row_index.get(&key))
.map(|rows| rows.to_vec())
.unwrap_or_default()
} else if let Some(rows) = self.best_address_token_rows(&parsed.candidate_terms) {
rows.iter()
.take(ADDRESS_SEARCH_CANDIDATE_LIMIT)
.copied()
.collect()
} else {
Vec::new()
};
{
candidate_rows = if candidate_rows.is_empty() {
rows.clone()
} else {
union_sorted(&candidate_rows, rows)
};
}
}
if candidate_rows.is_empty() {
return Vec::new();
}
// When the user appended a partial postcode, keep in-area rows ahead of the cut so the
// refinement still surfaces even for very common roads. Single pass (stable partition) so
// the postcode check — which allocates — runs exactly once per candidate.
if let Some(area) = parsed.postcode_area.as_deref() {
let mut in_area = Vec::new();
let mut others = Vec::new();
for &row in &candidate_rows {
if self.row_postcode_in_area(row as usize, area) {
in_area.push(row);
} else {
others.push(row);
}
}
in_area.extend(others);
candidate_rows = in_area;
}
candidate_rows.truncate(ADDRESS_SEARCH_CANDIDATE_LIMIT);
let mut scored: Vec<(i32, usize, usize)> = candidate_rows
.into_iter()
.filter_map(|row| {
@ -948,18 +1147,29 @@ impl PropertyData {
.then(left.2.cmp(&right.2))
});
// Collapse a road browse (road-type word, no house number) to one row per street.
let collapse_streets = parsed.numeric_terms.is_empty() && query_has_road_type(query);
let mut seen = FxHashSet::default();
let mut results = Vec::with_capacity(limit);
for (_, _, row) in scored {
for (score, _, row) in scored {
let address = self.address(row).trim();
if address.is_empty() {
continue;
}
let key = format!("{}\n{}", address.to_ascii_lowercase(), self.postcode(row));
let key = if collapse_streets {
format!(
"{}\n{}",
street_key(address),
outcode_of(self.postcode(row))
)
} else {
format!("{}\n{}", address.to_ascii_lowercase(), self.postcode(row))
};
if !seen.insert(key) {
continue;
}
results.push(row);
results.push((row, score));
if results.len() == limit {
break;
}
@ -968,36 +1178,75 @@ impl PropertyData {
results
}
fn best_address_token_rows(&self, terms: &[String]) -> Option<&[u32]> {
let mut best: Option<&[u32]> = None;
for term in terms {
if let Some(rows) = self.address_token_index.get(term) {
if best.is_none_or(|current| rows.len() < current.len()) {
best = Some(rows.as_slice());
}
continue;
}
if term.len() < 4 {
continue;
}
if let Some(tokens) = self.address_prefix_index.get(address_prefix_key(term)) {
for token in tokens {
if !token.starts_with(term) {
continue;
}
if let Some(rows) = self.address_token_index.get(token) {
if best.is_none_or(|current| rows.len() < current.len()) {
best = Some(rows.as_slice());
}
}
}
/// True when the row's postcode begins with the compact partial-postcode `area`
/// (e.g. "NW1" or "NW16" matches "NW1 6XE").
///
/// The row's postcode is compared with whitespace removed and letters ASCII-uppercased;
/// `area` is compared as given, so callers pass it already compact and uppercase. The
/// comparison walks both sides lazily instead of building a temporary `String`, because
/// this check runs once or more per candidate row.
fn row_postcode_in_area(&self, row: usize, area: &str) -> bool {
    let mut compact = self
        .postcode(row)
        .chars()
        .filter(|ch| !ch.is_whitespace())
        .map(|ch| ch.to_ascii_uppercase());
    // Prefix test: every char of `area` must equal the next compacted postcode char.
    area.chars().all(|expected| compact.next() == Some(expected))
}
best
/// Candidate rows for the distinctive query words.
///
/// Words that match an indexed token exactly are resolved to their posting lists, which are
/// intersected smallest-first (stopping once nothing is left); words without an exact match
/// are ignored in that case. Only when no word matches exactly does the lookup fall back to
/// `prefix_seed_rows`, which keeps a still-being-typed word working.
fn address_candidate_rows(&self, terms: &[String]) -> Vec<u32> {
    let mut postings: Vec<&[u32]> = Vec::with_capacity(terms.len());
    for term in terms {
        if let Some(rows) = self.address_token_index.get(term) {
            postings.push(rows.as_slice());
        }
    }
    // Starting from the shortest list keeps every intermediate intersection small.
    postings.sort_by_key(|rows| rows.len());

    let Some((smallest, rest)) = postings.split_first() else {
        return self.prefix_seed_rows(terms);
    };
    let mut remaining = smallest.to_vec();
    for rows in rest {
        if remaining.is_empty() {
            break;
        }
        remaining = intersect_sorted(&remaining, rows);
    }
    remaining
}
/// Seed rows from a prefix expansion — used only when no word matched an indexed token
/// exactly (i.e. the user is still typing a word).
///
/// For each term of at least `ADDRESS_SEARCH_PREFIX_MIN_LEN` bytes, the indexed tokens that
/// start with it are looked up through the prefix index and their posting lists unioned.
/// The smallest non-empty union across all terms is returned; the result is empty when no
/// term expands to anything.
fn prefix_seed_rows(&self, terms: &[String]) -> Vec<u32> {
    let mut smallest: Option<Vec<u32>> = None;
    for term in terms {
        if term.len() < ADDRESS_SEARCH_PREFIX_MIN_LEN {
            continue;
        }
        let Some(tokens) = self.address_prefix_index.get(address_prefix_key(term)) else {
            continue;
        };

        let mut expanded: Vec<u32> = Vec::new();
        for token in tokens {
            // The prefix key may be shorter than the term, so re-check the full prefix.
            if !token.starts_with(term) {
                continue;
            }
            let Some(rows) = self.address_token_index.get(token) else {
                continue;
            };
            expanded = if expanded.is_empty() {
                rows.clone()
            } else {
                union_sorted(&expanded, rows)
            };
        }

        if expanded.is_empty() {
            continue;
        }
        let is_smaller = match &smallest {
            Some(current) => expanded.len() < current.len(),
            None => true,
        };
        if is_smaller {
            smallest = Some(expanded);
        }
    }
    smallest.unwrap_or_default()
}
fn address_match_score(&self, row: usize, parsed: &AddressQuery) -> Option<i32> {
@ -1037,6 +1286,12 @@ impl PropertyData {
if numeric_matches == parsed.numeric_terms.len() && numeric_matches > 0 {
score += 50;
}
// Additive bias (never a filter) when the row sits in the appended partial postcode.
if let Some(area) = parsed.postcode_area.as_deref() {
if self.row_postcode_in_area(row, area) {
score += 400;
}
}
Some(score)
}
@ -1969,16 +2224,23 @@ impl PropertyData {
}
}
}
let address_token_count_before_prune = address_token_index.len();
address_token_index.retain(|_, rows| rows.len() <= ADDRESS_SEARCH_MAX_POSTINGS_PER_TOKEN);
// Keep every distinctive token: common road words ("high", "church", "station") are
// exactly what people search, and dropping them made those roads unsearchable while a
// prefix fallback surfaced the wrong street ("Highbury" for "High"). The candidate scan
// is bounded per query instead (ADDRESS_SEARCH_CANDIDATE_LIMIT), and stop words are
// already excluded from the index, so the largest posting lists stay modest.
let max_postings = address_token_index
.values()
.map(Vec::len)
.max()
.unwrap_or(0);
let address_prefix_index = build_address_prefix_index(&address_token_index);
let address_search_interner = address_search_rodeo.into_reader();
let address_postings_count: usize = address_token_index.values().map(Vec::len).sum();
tracing::info!(
tokens = address_token_index.len(),
prefixes = address_prefix_index.len(),
pruned_tokens =
address_token_count_before_prune.saturating_sub(address_token_index.len()),
max_postings_per_token = max_postings,
postings = address_postings_count,
row_tokens = address_search_token_keys.len(),
"Address search index built"
@ -2340,6 +2602,79 @@ mod tests {
assert_eq!(parsed.candidate_terms, vec!["downing".to_string()]);
}
#[test]
fn address_query_recovers_appended_partial_postcode_as_bias() {
    let parsed = parse_address_query("Baker Street NW1");

    // The trailing outcode is kept as an area hint, not promoted to a full postcode.
    assert!(parsed.full_postcode.is_none());
    assert_eq!(parsed.postcode_area.as_deref(), Some("NW1"));

    // The road word stays searchable; the postcode fragment neither consumed it nor left a
    // phantom number behind.
    assert_eq!(parsed.candidate_terms, ["baker"]);
    assert_eq!(parsed.numeric_terms.len(), 0);
}
#[test]
fn address_query_recovers_outcode_plus_sector_without_a_phantom_house_number() {
    let parsed = parse_address_query("High Street CR0 2");

    // Outcode and sector digit are fused into one compact area.
    assert_eq!(parsed.postcode_area.as_deref(), Some("CR02"));

    // The lone sector digit must not surface as a house number.
    assert_eq!(parsed.numeric_terms.len(), 0);
    assert_eq!(parsed.candidate_terms, ["high"]);
}
#[test]
fn full_postcode_takes_precedence_over_partial_bias() {
    let parsed = parse_address_query("Baker Street NW1 6XE");

    // A complete postcode is reported as such and leaves no partial-area hint behind.
    assert_eq!(parsed.full_postcode.as_deref(), Some("NW1 6XE"));
    assert!(parsed.postcode_area.is_none());
}
#[test]
fn intersect_and_union_sorted_row_ids() {
    // Intersection keeps only the shared ids; disjoint inputs give nothing.
    let overlapping = intersect_sorted(&[1, 2, 3, 5], &[2, 3, 4, 5]);
    assert_eq!(overlapping, [2, 3, 5]);
    let disjoint = intersect_sorted(&[1, 2], &[3, 4]);
    assert!(disjoint.is_empty());

    // Union merges in order, keeps a shared id once, and passes a lone side through.
    let merged = union_sorted(&[1, 3, 5], &[2, 3, 4]);
    assert_eq!(merged, [1, 2, 3, 4, 5]);
    let one_sided = union_sorted(&[], &[2, 4]);
    assert_eq!(one_sided, [2, 4]);
}
#[test]
fn street_key_collapses_house_numbers_and_flats() {
    // (address as written, street key it must collapse to)
    let cases = [
        ("12 Baker Street", "baker street"),
        ("5 Baker Street", "baker street"),
        ("Flat 2, 10 Downing Street", "downing street"),
        ("221B Baker Street", "baker street"),
    ];
    for (address, expected) in cases {
        assert_eq!(street_key(address), expected);
    }
}
#[test]
fn street_key_keeps_ordinal_street_names() {
    // An ordinal belongs to the street name, so it survives; a house number before it
    // is still dropped.
    for (address, expected) in [("2nd Avenue", "2nd avenue"), ("12 3rd Avenue", "3rd avenue")] {
        assert_eq!(street_key(address), expected);
    }

    // Digits followed by an ordinal suffix qualify; bare numbers and "221b" do not.
    assert!(is_ordinal_token("21st"));
    for not_ordinal in ["21", "221b"] {
        assert!(!is_ordinal_token(not_ordinal));
    }
}
#[test]
fn postcode_area_recovered_only_from_the_trailing_position() {
    // "A4" at the front is a road designation, not an area refinement.
    let leading = parse_address_query("A4 Great West Road");
    assert!(leading.postcode_area.is_none());

    // The same kind of fragment at the end is a genuine outcode and is recovered.
    let trailing = parse_address_query("Great West Road W4");
    assert_eq!(trailing.postcode_area.as_deref(), Some("W4"));
}
#[test]
fn road_type_detection() {
    // Queries containing a road-type word are road browses...
    for with_road_type in ["high street", "acacia avenue"] {
        assert!(query_has_road_type(with_road_type));
    }
    // ...while a bare name or a place is not.
    for without_road_type in ["acacia", "london"] {
        assert!(!query_has_road_type(without_road_type));
    }
}
#[test]
fn address_query_parsing_keeps_partial_terms_for_row_matching() {
let parsed = parse_address_query("settlers cour");