This commit is contained in:
Andras Schmelczer 2026-06-02 13:46:18 +01:00
parent a04ac2d857
commit d43da9708c
47 changed files with 4120 additions and 573 deletions

View file

@ -37,7 +37,9 @@ where
pub use actual_listings::{ActualListing, ActualListingData};
pub use crime_by_year::CrimeByYearData;
pub use places::{normalize_search_text, PlaceData};
pub use places::{
compute_trigrams, normalize_search_text, place_alias_tokens, trigram_similarity, PlaceData,
};
pub use poi::{resolve_poi_category_filter, POICategoryGroup, POIData, SchoolMetadata};
pub use postcodes::{OutcodeData, PostcodeData};
pub use property::{

View file

@ -120,7 +120,7 @@ impl CrimeByYearData {
.list()
.with_context(|| format!("Column '{col_name}' is not a list"))?;
for row in 0..row_count {
for (row, postcode) in postcode_values.iter().enumerate().take(row_count) {
let Some(inner) = list_ca.get_as_series(row) else {
continue;
};
@ -163,7 +163,7 @@ impl CrimeByYearData {
points.sort_by_key(|p| p.year);
series_by_postcode
.entry(postcode_values[row].clone())
.entry(postcode.clone())
.or_default()
.push(PostcodeCrimeSeries {
type_idx: type_idx as u16,

View file

@ -4,10 +4,16 @@ use anyhow::Context;
use polars::frame::DataFrame;
use polars::lazy::frame::LazyFrame;
use polars::prelude::*;
use rustc_hash::FxHashMap;
use tracing::info;
use crate::utils::InternedColumn;
/// Upper bound on place rows scored per query (candidate sets are normally far smaller).
const PLACE_CANDIDATE_LIMIT: usize = 50_000;
/// Shortest token prefix (in bytes) stored in the prefix index; query tokens shorter than this
/// are never prefix-expanded.
const PLACE_PREFIX_MIN_LEN: usize = 2;
/// Longest token prefix (in bytes) stored in the prefix index; a longer query token is cut to
/// this length for the lookup and then verified against each indexed token with `starts_with`.
const PLACE_PREFIX_MAX_LEN: usize = 6;
pub struct PlaceData {
pub name: Vec<String>,
pub name_lower: Vec<String>,
@ -19,6 +25,13 @@ pub struct PlaceData {
pub lon: Vec<f32>,
pub city: Vec<Option<String>>,
pub travel_destination: Vec<bool>,
/// Inverted index from an alias token to the (ascending) place rows containing it. Lets place
/// search gather candidates instead of scanning all ~1M+ rows per keystroke.
token_index: FxHashMap<String, Vec<u32>>,
/// Prefix → indexed tokens, for matching a partially-typed final word.
token_prefix_index: FxHashMap<String, Vec<String>>,
/// Trigram → fuzzy-eligible rows (settlements/stations only), for bounded typo matching.
fuzzy_trigram_index: FxHashMap<u32, Vec<u32>>,
}
#[derive(Clone, Copy)]
@ -168,6 +181,148 @@ pub fn normalize_search_text(text: &str) -> String {
result
}
/// Yields every non-empty token found in a place's search text, across all of its aliases.
///
/// Both the word separator (`' '`) and the alias separator (`'|'`) act as delimiters, so the
/// result is one flat token stream. Used for token-AND matching, where every query word must
/// prefix-match some place token.
pub fn place_alias_tokens(search_text: &str) -> impl Iterator<Item = &str> {
    let is_separator = |ch: char| ch == ' ' || ch == '|';
    search_text
        .split(is_separator)
        .filter(|piece| !piece.is_empty())
}
/// Hashes three characters into a `u32` with an FNV-1a-style mix (xor, then wrapping multiply)
/// applied to each character's code point in order.
fn trigram_hash(first: char, second: char, third: char) -> u32 {
    const OFFSET_BASIS: u32 = 2_166_136_261;
    const PRIME: u32 = 16_777_619;
    [first, second, third]
        .iter()
        .fold(OFFSET_BASIS, |hash, &ch| {
            (hash ^ u32::from(ch)).wrapping_mul(PRIME)
        })
}
/// Returns the sorted, de-duplicated hashes of the padded character trigrams of `text`, ready
/// for Jaccard comparison with [`trigram_similarity`].
///
/// `text` is first passed through `normalize_search_text`; if nothing remains the result is
/// empty. Otherwise the normalized text is padded with two leading spaces and one trailing
/// space before the 3-character windows are hashed.
pub fn compute_trigrams(text: &str) -> Vec<u32> {
    let normalized = normalize_search_text(text);
    if normalized.is_empty() {
        return Vec::new();
    }

    let mut padded: Vec<char> = Vec::with_capacity(normalized.len() + 3);
    padded.extend([' ', ' ']);
    padded.extend(normalized.chars());
    padded.push(' ');

    let mut hashes: Vec<u32> = Vec::new();
    for gram in padded.windows(3) {
        hashes.push(trigram_hash(gram[0], gram[1], gram[2]));
    }
    hashes.sort_unstable();
    hashes.dedup();
    hashes
}
/// Returns the row ids present in both inputs, in ascending order.
///
/// Both slices must already be sorted ascending; the merge walks each one once.
fn intersect_sorted(left: &[u32], right: &[u32]) -> Vec<u32> {
    use std::cmp::Ordering;

    let mut common = Vec::new();
    let (mut a, mut b) = (left, right);
    while let (Some((&x, a_tail)), Some((&y, b_tail))) = (a.split_first(), b.split_first()) {
        match x.cmp(&y) {
            Ordering::Less => a = a_tail,
            Ordering::Greater => b = b_tail,
            Ordering::Equal => {
                common.push(x);
                a = a_tail;
                b = b_tail;
            }
        }
    }
    common
}
/// Merges two ascending-sorted row-id slices into one ascending list.
///
/// An id present in both inputs is emitted once; the result stays sorted.
fn union_sorted(left: &[u32], right: &[u32]) -> Vec<u32> {
    use std::cmp::Ordering;

    let mut merged = Vec::with_capacity(left.len() + right.len());
    let (mut a, mut b) = (left, right);
    while let (Some((&x, a_tail)), Some((&y, b_tail))) = (a.split_first(), b.split_first()) {
        match x.cmp(&y) {
            Ordering::Less => {
                merged.push(x);
                a = a_tail;
            }
            Ordering::Greater => {
                merged.push(y);
                b = b_tail;
            }
            Ordering::Equal => {
                merged.push(x);
                a = a_tail;
                b = b_tail;
            }
        }
    }
    // At most one side still has elements; append whatever is left.
    merged.extend_from_slice(a);
    merged.extend_from_slice(b);
    merged
}
/// Distinct indexable tokens (byte length ≥ 2) across all of a place's search aliases, returned
/// in sorted order.
///
/// NOTE(review): the prefix index byte-slices these tokens, which is only safe if
/// `normalize_search_text` guarantees ASCII output — confirm against that function.
fn place_index_tokens(search_text: &str) -> Vec<String> {
    // An ordered set de-duplicates and sorts in one step, before any `String` is allocated.
    let distinct: std::collections::BTreeSet<&str> = place_alias_tokens(search_text)
        .filter(|token| token.len() >= 2)
        .collect();
    distinct.into_iter().map(str::to_owned).collect()
}
/// Builds the prefix → tokens lookup used to expand a partially-typed query word.
///
/// For every key of `token_index`, each prefix whose byte length lies in
/// `PLACE_PREFIX_MIN_LEN..=PLACE_PREFIX_MAX_LEN` (capped at the token's own length) maps back
/// to the full token. Each token list is sorted and de-duplicated so the result does not
/// depend on hash-map iteration order.
fn build_place_prefix_index(
    token_index: &FxHashMap<String, Vec<u32>>,
) -> FxHashMap<String, Vec<String>> {
    let mut prefix_index: FxHashMap<String, Vec<String>> = FxHashMap::default();
    for token in token_index.keys() {
        let max_len = token.len().min(PLACE_PREFIX_MAX_LEN);
        for len in PLACE_PREFIX_MIN_LEN..=max_len {
            // `get` returns `None` when `len` would split a multi-byte character, where plain
            // `token[..len]` would panic. Such a length is skipped; ASCII tokens are unaffected.
            let Some(prefix) = token.get(..len) else {
                continue;
            };
            prefix_index
                .entry(prefix.to_string())
                .or_default()
                .push(token.clone());
        }
    }
    for tokens in prefix_index.values_mut() {
        tokens.sort_unstable();
        tokens.dedup();
    }
    prefix_index
}
/// Whether a place type participates in fuzzy (typo) matching.
///
/// Every type is eligible except the high-volume ones listed below (streets and POI-style
/// types); leaving those out keeps the fuzzy trigram index small. The comparison is exact and
/// case-sensitive.
fn is_fuzzy_eligible_type(place_type: &str) -> bool {
    const EXCLUDED_TYPES: [&str; 5] = ["street", "park", "attraction", "hospital", "retail"];
    !EXCLUDED_TYPES.contains(&place_type)
}
/// Jaccard similarity between two trigram sets, in the range 0.0 to 1.0.
///
/// Both slices must be sorted ascending and free of duplicates (as produced by
/// [`compute_trigrams`]). Returns 0.0 when either set is empty.
pub fn trigram_similarity(left: &[u32], right: &[u32]) -> f32 {
    use std::cmp::Ordering;

    if left.is_empty() || right.is_empty() {
        return 0.0;
    }

    // Merge-walk both sorted sets, counting the elements they share.
    let mut shared = 0usize;
    let (mut a, mut b) = (left, right);
    while let (Some((x, a_tail)), Some((y, b_tail))) = (a.split_first(), b.split_first()) {
        match x.cmp(y) {
            Ordering::Less => a = a_tail,
            Ordering::Greater => b = b_tail,
            Ordering::Equal => {
                shared += 1;
                a = a_tail;
                b = b_tail;
            }
        }
    }

    let union = left.len() + right.len() - shared;
    shared as f32 / union as f32
}
fn replace_token(text: &str, from: &str, to: &str) -> Option<String> {
let mut changed = false;
let replaced: Vec<&str> = text
@ -191,15 +346,31 @@ fn push_alias(aliases: &mut Vec<String>, alias: String) {
}
}
/// Bidirectional token abbreviations expanded into search aliases so a query typed either
/// way matches (e.g. "gt missenden" ↔ "Great Missenden", "mt" ↔ "Mount").
///
/// Each entry is a `(from, to)` pair. A replacement is applied in one direction only, so every
/// abbreviation is listed twice — once per direction. Entries are lowercase.
const PLACE_TOKEN_ALIASES: &[(&str, &str)] = &[
    ("st", "saint"),
    ("saint", "st"),
    ("mt", "mount"),
    ("mount", "mt"),
    ("gt", "great"),
    ("great", "gt"),
    ("lt", "little"),
    ("little", "lt"),
    ("upr", "upper"),
    ("upper", "upr"),
    ("lwr", "lower"),
    ("lower", "lwr"),
];
fn build_search_text(name: &str, place_type: &str) -> String {
let primary = normalize_search_text(name);
let mut aliases = vec![primary.clone()];
if let Some(alias) = replace_token(&primary, "st", "saint") {
push_alias(&mut aliases, alias);
}
if let Some(alias) = replace_token(&primary, "saint", "st") {
push_alias(&mut aliases, alias);
for (from, to) in PLACE_TOKEN_ALIASES {
if let Some(alias) = replace_token(&primary, from, to) {
push_alias(&mut aliases, alias);
}
}
if place_type == "station" {
@ -391,6 +562,26 @@ impl PlaceData {
fallback_city
};
// Build the place search index: an inverted token index over all rows (so the per-query
// cost scales with matched candidates, not the ~1M-row corpus), plus a trigram index over
// only fuzzy-eligible rows for bounded typo matching.
let mut token_index: FxHashMap<String, Vec<u32>> = FxHashMap::default();
let mut fuzzy_trigram_index: FxHashMap<u32, Vec<u32>> = FxHashMap::default();
for idx in 0..row_count {
for token in place_index_tokens(&name_search[idx]) {
token_index.entry(token).or_default().push(idx as u32);
}
if is_fuzzy_eligible_type(&place_type_raw[idx]) {
for trigram in compute_trigrams(&name[idx]) {
fuzzy_trigram_index
.entry(trigram)
.or_default()
.push(idx as u32);
}
}
}
let token_prefix_index = build_place_prefix_index(&token_index);
let with_pop = population.iter().filter(|&&pop| pop > 0).count();
let with_city = city.iter().filter(|c| c.is_some()).count();
info!(
@ -398,6 +589,8 @@ impl PlaceData {
types = place_type.values.len(),
with_population = with_pop,
with_city = with_city,
tokens = token_index.len(),
fuzzy_trigrams = fuzzy_trigram_index.len(),
"Place data loaded"
);
@ -412,14 +605,261 @@ impl PlaceData {
lon,
city,
travel_destination,
token_index,
token_prefix_index,
fuzzy_trigram_index,
})
}
/// Gathers the candidate place rows for a query's content tokens.
///
/// Tokens that match an indexed token exactly have their posting lists intersected, smallest
/// list first. Tokens with no exact match are ignored in that case. If no token matches
/// exactly, candidates are seeded from the smallest prefix-expanded list instead, so a
/// partially-typed final word still works. The result is capped at `PLACE_CANDIDATE_LIMIT`
/// rows.
pub fn place_candidate_rows(&self, tokens: &[&str]) -> Vec<u32> {
    let mut postings: Vec<&[u32]> = Vec::with_capacity(tokens.len());
    for token in tokens {
        if let Some(rows) = self.token_index.get(*token) {
            postings.push(rows.as_slice());
        }
    }
    // Starting from the shortest list keeps every intermediate intersection small.
    postings.sort_by_key(|rows| rows.len());

    let mut candidates = match postings.split_first() {
        None => self.place_prefix_seed(tokens),
        Some((smallest, rest)) => {
            let mut acc = smallest.to_vec();
            for rows in rest {
                if acc.is_empty() {
                    break;
                }
                acc = intersect_sorted(&acc, rows);
            }
            acc
        }
    };
    candidates.truncate(PLACE_CANDIDATE_LIMIT);
    candidates
}
/// Seeds candidates from prefix expansion when no query token matched an indexed token exactly.
///
/// For each query token of at least `PLACE_PREFIX_MIN_LEN` bytes, looks up the indexed tokens
/// sharing its (length-capped) prefix, keeps those that really start with the query token, and
/// unions their posting lists. Returns the smallest non-empty union across all query tokens, or
/// an empty list if none expands.
fn place_prefix_seed(&self, tokens: &[&str]) -> Vec<u32> {
    let mut best: Option<Vec<u32>> = None;
    for token in tokens {
        if token.len() < PLACE_PREFIX_MIN_LEN {
            continue;
        }
        // Cut the lookup key at PLACE_PREFIX_MAX_LEN bytes, rounded down to a char boundary.
        // Query tokens come from the caller, and slicing inside a multi-byte character would
        // panic. Offset 0 is always a boundary, so the loop terminates.
        let mut key_len = token.len().min(PLACE_PREFIX_MAX_LEN);
        while !token.is_char_boundary(key_len) {
            key_len -= 1;
        }
        let Some(indexed) = self.token_prefix_index.get(&token[..key_len]) else {
            continue;
        };
        let mut union: Vec<u32> = Vec::new();
        for indexed_token in indexed {
            // The capped key can be shorter than the query token, so re-check the full prefix.
            if !indexed_token.starts_with(*token) {
                continue;
            }
            if let Some(rows) = self.token_index.get(indexed_token) {
                union = if union.is_empty() {
                    rows.clone()
                } else {
                    union_sorted(&union, rows)
                };
            }
        }
        if !union.is_empty()
            && best
                .as_ref()
                .is_none_or(|current| union.len() < current.len())
        {
            best = Some(union);
        }
    }
    best.unwrap_or_default()
}
/// Returns the fuzzy-eligible rows that share enough trigrams with the query to be worth
/// Jaccard scoring.
///
/// A row qualifies when it shares at least `ceil(0.4 * query_trigrams.len())` (minimum 1) of
/// the query's trigrams. Only rows in the fuzzy trigram index are visited. The returned rows
/// are in no particular order.
pub fn fuzzy_candidate_rows(&self, query_trigrams: &[u32]) -> Vec<u32> {
    if query_trigrams.is_empty() {
        return Vec::new();
    }

    // Tally, per row, how many of the query's trigrams it contains.
    let mut shared_by_row: FxHashMap<u32, u16> = FxHashMap::default();
    let matching_rows = query_trigrams
        .iter()
        .filter_map(|trigram| self.fuzzy_trigram_index.get(trigram))
        .flatten();
    for &row in matching_rows {
        *shared_by_row.entry(row).or_default() += 1;
    }

    let threshold = (((query_trigrams.len() as f32) * 0.4).ceil() as u16).max(1);
    shared_by_row
        .into_iter()
        .filter(|&(_, shared)| shared >= threshold)
        .map(|(row, _)| row)
        .collect()
}
}
#[cfg(test)]
impl PlaceData {
    /// Test-only constructor: builds a minimal `PlaceData` from `(name, place_type)` pairs.
    ///
    /// The search text and all three search indexes are derived from the names. Population and
    /// coordinates are zeroed, `city` is `None` and `travel_destination` is `false` for every
    /// row.
    fn from_names<S: AsRef<str>>(rows: &[(S, S)]) -> Self {
        let (name, place_type_raw): (Vec<String>, Vec<String>) = rows
            .iter()
            .map(|(nm, pt)| (nm.as_ref().to_string(), pt.as_ref().to_string()))
            .unzip();
        let name_lower: Vec<String> = name.iter().map(|nm| nm.to_lowercase()).collect();
        let name_search: Vec<String> = name
            .iter()
            .zip(&place_type_raw)
            .map(|(nm, pt)| build_search_text(nm, pt))
            .collect();

        let mut token_index: FxHashMap<String, Vec<u32>> = FxHashMap::default();
        let mut fuzzy_trigram_index: FxHashMap<u32, Vec<u32>> = FxHashMap::default();
        for (idx, search_text) in name_search.iter().enumerate() {
            let row = idx as u32;
            for token in place_index_tokens(search_text) {
                token_index.entry(token).or_default().push(row);
            }
            if !is_fuzzy_eligible_type(&place_type_raw[idx]) {
                continue;
            }
            for trigram in compute_trigrams(&name[idx]) {
                fuzzy_trigram_index.entry(trigram).or_default().push(row);
            }
        }
        let token_prefix_index = build_place_prefix_index(&token_index);

        let len = name.len();
        let place_type = InternedColumn::build(&place_type_raw);
        let type_rank = place_type_raw.iter().map(|pt| type_rank(pt)).collect();
        PlaceData {
            name,
            name_lower,
            name_search,
            place_type,
            type_rank,
            population: vec![0; len],
            lat: vec![0.0; len],
            lon: vec![0.0; len],
            city: vec![None; len],
            travel_destination: vec![false; len],
            token_index,
            token_prefix_index,
            fuzzy_trigram_index,
        }
    }
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn place_index_tokens_dedup_and_min_length() {
    // The lone "a" and "x" are below the two-byte minimum and must be dropped. "albans" appears
    // in both aliases (split on " | ") and must be reported once. Output is sorted.
    assert_eq!(
        place_index_tokens("a st albans | saint albans x"),
        vec!["albans".to_string(), "saint".to_string(), "st".to_string()]
    );
}
#[test]
fn place_candidate_rows_intersect_and_prefix_seed() {
    let places = PlaceData::from_names(&[
        ("Camden", "suburb"),
        ("Camden Town", "suburb"),
        ("Camden Market", "attraction"),
        ("Manchester", "city"),
        ("Manchester Piccadilly", "station"),
    ]);

    let cases: Vec<(Vec<&str>, Vec<u32>)> = vec![
        // One full word: its whole posting list (Camden, Camden Town, Camden Market).
        (vec!["camden"], vec![0, 1, 2]),
        // Two full words: only rows containing BOTH (Camden Town).
        (vec!["camden", "town"], vec![1]),
        // A partially-typed word with no exact token: seeded from the prefix index.
        (vec!["piccad"], vec![4]),
        // Nothing matches: empty.
        (vec!["zzzz"], vec![]),
    ];
    for (query, expected) in cases {
        assert_eq!(places.place_candidate_rows(&query), expected);
    }
}
// Run with: cargo test --release bench_place_search -- --ignored --nocapture
//
// Micro-benchmark, ignored by default. It builds ~1M synthetic street rows plus one city,
// runs the indexed candidate lookup and per-candidate match check 50 times for "high street",
// and fails if the average per-query time is 10ms or more. The timing is machine-dependent,
// hence the release-mode instruction above.
#[test]
#[ignore]
fn bench_place_search_at_one_million_rows() {
    let roads = [
        "High Street",
        "Station Road",
        "Church Lane",
        "Victoria Road",
        "Mill Lane",
        "Park Avenue",
        "Queens Road",
        "Kings Road",
    ];
    let mut rows: Vec<(String, String)> = Vec::with_capacity(1_000_000);
    for i in 0..1_000_000usize {
        // Vary the name so the index resembles ~1M distinct (street, area) rows.
        rows.push((
            format!("{} {}", roads[i % roads.len()], i % 4000),
            "street".into(),
        ));
    }
    rows.push(("London".into(), "city".into()));
    // Index construction happens here, before the timer starts.
    let pd = PlaceData::from_names(&rows);
    let start = std::time::Instant::now();
    let mut hits = 0usize;
    for _ in 0..50 {
        let candidates = pd.place_candidate_rows(&["high", "street"]);
        for row in candidates {
            let idx = row as usize;
            if place_search_test_score(&pd, idx, "high street", &["high", "street"]).is_some() {
                hits += 1;
            }
        }
    }
    // Average over the 50 iterations; `hits` is summed across them, so it is divided as well.
    let per_query = start.elapsed() / 50;
    println!(
        "indexed place search over {} rows: {:?}/query ({} hits)",
        pd.name.len(),
        per_query,
        hits / 50
    );
    // The old full O(N) scan measured ~36ms here; candidate-based must be far under that.
    assert!(per_query.as_millis() < 10, "per_query was {per_query:?}");
}
/// Bench stand-in for the route's per-candidate match check.
///
/// Returns `Some(640.0)` when every query token equals, or (for tokens of two or more bytes)
/// is a prefix of, some alias token of row `idx`. Failing that, returns `Some(1000.0)` when the
/// row's lowercase name equals `query_search`, and `None` otherwise.
fn place_search_test_score(
    pd: &PlaceData,
    idx: usize,
    query_search: &str,
    query_tokens: &[&str],
) -> Option<f32> {
    let aliases = pd.name_search[idx].as_str();
    let token_matches = |wanted: &str| {
        place_alias_tokens(aliases)
            .any(|have| have == wanted || (wanted.len() >= 2 && have.starts_with(wanted)))
    };

    if query_tokens.iter().all(|wanted| token_matches(*wanted)) {
        return Some(640.0);
    }
    if pd.name_lower[idx] == query_search {
        return Some(1000.0);
    }
    None
}
#[test]
fn fuzzy_candidate_rows_finds_typos_only_for_eligible_rows() {
    const LONDON_ROW: u32 = 0;
    const BAKER_STREET_ROW: u32 = 1;
    let places = PlaceData::from_names(&[
        ("London", "city"),
        ("Baker Street", "street"), // streets are not fuzzy-eligible
    ]);

    let reachable = places.fuzzy_candidate_rows(&compute_trigrams("Londn"));

    // The misspelt city is reachable; the street never enters the fuzzy index.
    assert!(reachable.contains(&LONDON_ROW));
    assert!(!reachable.contains(&BAKER_STREET_ROW));
}
fn test_city_rows() -> [(&'static str, f32, f32, u32); 5] {
[
("London", 51.507_446, -0.1277653, 8_908_083),
@ -470,6 +910,29 @@ mod tests {
assert!(build_search_text("Shadwell DLR station", "station").contains("shadwell station"));
}
#[test]
fn search_text_expands_directional_and_size_abbreviations() {
    // (place name, place type, abbreviated alias expected in the search text)
    let cases = [
        ("Great Missenden", "village", "gt missenden"),
        ("Mount Pleasant", "suburb", "mt pleasant"),
        ("Little Venice", "suburb", "lt venice"),
    ];
    for (name, place_type, alias) in cases {
        let search_text = build_search_text(name, place_type);
        assert!(search_text.contains(alias));
    }
}
#[test]
fn trigram_similarity_is_high_for_typos_and_low_for_unrelated() {
    let reference = compute_trigrams("London");

    // A one-letter typo stays above the fuzzy threshold.
    let misspelt = trigram_similarity(&reference, &compute_trigrams("Londn"));
    assert!(misspelt >= 0.4);

    // An unrelated name scores low.
    let unrelated = trigram_similarity(&reference, &compute_trigrams("Manchester"));
    assert!(unrelated < 0.2);

    // A set compared with itself scores (numerically) 1.0.
    let identical = trigram_similarity(&reference, &reference);
    assert!((identical - 1.0).abs() < 1e-6);
}
#[test]
fn place_alias_tokens_split_across_aliases() {
    // Tokens arrive in source order, with the " | " alias separator dropped.
    let mut tokens = place_alias_tokens("kings cross | kings x");
    for expected in ["kings", "cross", "kings", "x"] {
        assert_eq!(tokens.next(), Some(expected));
    }
    assert_eq!(tokens.next(), None);
}
#[test]
fn travel_destination_types_match_legacy_places() {
assert!(is_travel_destination_type("city"));

View file

@ -398,7 +398,7 @@ fn build_school_meta(
let mut idx = vec![u32::MAX; row_count];
let mut meta = Vec::new();
for row in 0..row_count {
for (row, meta_idx) in idx.iter_mut().enumerate().take(row_count) {
let type_group_val = fetch_str(&type_group, row);
let type_val = fetch_str(&r#type, row);
// type_group is present for every GIAS row, so use it as the sentinel
@ -406,7 +406,7 @@ fn build_school_meta(
if type_group_val.is_none() && type_val.is_none() {
continue;
}
idx[row] = meta.len() as u32;
*meta_idx = meta.len() as u32;
meta.push(SchoolMetadata {
phase: fetch_str(&phase, row),
r#type: type_val,

View file

@ -10,8 +10,10 @@ use rustc_hash::{FxHashMap, FxHashSet};
use crate::consts::{H3_PRECOMPUTE_MAX, HISTOGRAM_BINS, NAN_U16, QUANT_SCALE};
use crate::features::{self, Bounds};
const ADDRESS_SEARCH_CANDIDATE_LIMIT: usize = 50_000;
const ADDRESS_SEARCH_MAX_POSTINGS_PER_TOKEN: usize = 250_000;
/// Upper bound on rows scored per query. Intersection keeps most candidate sets far below
/// this; only a single very common road word (e.g. "high") approaches it, and the in-area
/// priority sort keeps a refined query's matches ahead of the cut.
const ADDRESS_SEARCH_CANDIDATE_LIMIT: usize = 150_000;
const ADDRESS_SEARCH_PREFIX_MIN_LEN: usize = 4;
const ADDRESS_SEARCH_PREFIX_MAX_LEN: usize = 8;
const NO_POI_METRIC_ROW: u32 = u32::MAX;
@ -162,6 +164,11 @@ struct AddressTermGroup {
#[derive(Debug)]
struct AddressQuery {
full_postcode: Option<String>,
/// Compact uppercase outward code (optionally with a sector digit) recovered when the
/// user appended a partial postcode like "NW1" or "NW1 6". Used as an additive ranking
/// bias, never as a hard filter — so the disambiguating hint is honoured without
/// excluding the same road in other areas.
postcode_area: Option<String>,
text_groups: Vec<AddressTermGroup>,
numeric_terms: Vec<String>,
candidate_terms: Vec<String>,
@ -442,6 +449,138 @@ fn build_address_prefix_index(
prefix_index
}
/// Returns the row ids present in both inputs, in ascending order.
///
/// Both slices must already be sorted ascending; the merge walks each one once.
fn intersect_sorted(left: &[u32], right: &[u32]) -> Vec<u32> {
    use std::cmp::Ordering;

    let mut common = Vec::new();
    let (mut a, mut b) = (left, right);
    while let (Some((&x, a_tail)), Some((&y, b_tail))) = (a.split_first(), b.split_first()) {
        match x.cmp(&y) {
            Ordering::Less => a = a_tail,
            Ordering::Greater => b = b_tail,
            Ordering::Equal => {
                common.push(x);
                a = a_tail;
                b = b_tail;
            }
        }
    }
    common
}
/// Merges two ascending-sorted row-id slices into one ascending list.
///
/// An id present in both inputs is emitted once; the result stays sorted.
fn union_sorted(left: &[u32], right: &[u32]) -> Vec<u32> {
    use std::cmp::Ordering;

    let mut merged = Vec::with_capacity(left.len() + right.len());
    let (mut a, mut b) = (left, right);
    while let (Some((&x, a_tail)), Some((&y, b_tail))) = (a.split_first(), b.split_first()) {
        match x.cmp(&y) {
            Ordering::Less => {
                merged.push(x);
                a = a_tail;
            }
            Ordering::Greater => {
                merged.push(y);
                b = b_tail;
            }
            Ordering::Equal => {
                merged.push(x);
                a = a_tail;
                b = b_tail;
            }
        }
    }
    // At most one side still has elements; append whatever is left.
    merged.extend_from_slice(a);
    merged.extend_from_slice(b);
    merged
}
/// An ordinal like "1st", "2nd", "3rd", "21st" — part of the street name ("2nd Avenue"), not a
/// house-number prefix.
///
/// True when `token` is one or more ASCII digits followed by "st", "nd", "rd" or "th". The
/// digit/suffix pairing is not validated, so "1th" also counts.
fn is_ordinal_token(token: &str) -> bool {
    // `strip_suffix` only ever splits where the suffix starts, so tokens containing multi-byte
    // characters cannot cause a char-boundary panic the way a fixed `len - 2` split can.
    ["st", "nd", "rd", "th"].iter().any(|&suffix| {
        token.strip_suffix(suffix).is_some_and(|digits| {
            !digits.is_empty() && digits.bytes().all(|byte| byte.is_ascii_digit())
        })
    })
}
/// Whether a leading address token denotes a unit or house number rather than the street
/// itself.
///
/// Ordinals ("2nd") are never prefixes. Otherwise a token is a prefix when it is a unit word
/// ("flat", "apt", …), is a single byte long, consists only of ASCII digits, or starts with an
/// ASCII digit and contains an ASCII letter (e.g. "221b").
fn is_house_prefix_token(token: &str) -> bool {
    const UNIT_WORDS: [&str; 9] = [
        "flat",
        "fl",
        "apartment",
        "apt",
        "unit",
        "no",
        "block",
        "floor",
        "room",
    ];

    if is_ordinal_token(token) {
        return false;
    }
    if UNIT_WORDS.contains(&token) || token.len() == 1 {
        return true;
    }
    if token.chars().all(|ch| ch.is_ascii_digit()) {
        return true;
    }
    let starts_with_digit = token.chars().next().is_some_and(|ch| ch.is_ascii_digit());
    starts_with_digit && token.chars().any(|ch| ch.is_ascii_alphabetic())
}
/// Street-level key for an address: the tokenized address with its leading house-number / flat
/// tokens removed, joined by single spaces, so "12 Baker Street" and "5 Baker Street" collapse
/// to the same key.
///
/// If every token looks like a house prefix, nothing is dropped and all tokens are kept.
fn street_key(address: &str) -> String {
    let tokens = tokenize_address_text(address);
    let prefix_len = tokens
        .iter()
        .take_while(|token| is_house_prefix_token(token.as_str()))
        .count();
    let kept = if prefix_len == tokens.len() {
        &tokens[..]
    } else {
        &tokens[prefix_len..]
    };
    kept.join(" ")
}
/// Road-type words. Their presence (with no house number) marks a road browse, which we
/// collapse to one result per street.
///
/// Entries are lowercase and include common abbreviations ("rd", "ave", "gdns"). They are
/// compared for exact equality against tokenized query words.
/// NOTE(review): presumably `tokenize_address_text` lowercases its output — confirm, otherwise
/// capitalised queries would not match.
const ROAD_TYPE_TOKENS: &[&str] = &[
    "street",
    "st",
    "road",
    "rd",
    "lane",
    "ln",
    "avenue",
    "ave",
    "close",
    "cl",
    "drive",
    "dr",
    "way",
    "court",
    "ct",
    "crescent",
    "cres",
    "place",
    "terrace",
    "terr",
    "grove",
    "gardens",
    "gdns",
    "walk",
    "row",
    "square",
    "sq",
    "hill",
    "parade",
    "mews",
    "embankment",
    "broadway",
    "boulevard",
    "blvd",
];
/// Whether any tokenized word of `query` is a road-type word from `ROAD_TYPE_TOKENS`.
fn query_has_road_type(query: &str) -> bool {
    let tokens = tokenize_address_text(query);
    for token in tokens.iter() {
        if ROAD_TYPE_TOKENS.contains(&token.as_str()) {
            return true;
        }
    }
    false
}
/// The outward code of a canonical postcode: everything before the first space, or the whole
/// string when it contains no space.
fn outcode_of(postcode: &str) -> &str {
    match postcode.find(' ') {
        Some(space) => &postcode[..space],
        None => postcode,
    }
}
fn parse_address_query(query: &str) -> AddressQuery {
let tokens = tokenize_address_text(query);
let (full_postcode, postcode_token_indices) = extract_full_postcode(&tokens)
@ -449,12 +588,45 @@ fn parse_address_query(query: &str) -> AddressQuery {
.unwrap_or((None, Vec::new()));
let skip_postcode_tokens: FxHashSet<usize> = postcode_token_indices.into_iter().collect();
// Recover an appended partial postcode (outcode, or outcode + sector digit) as a ranking
// bias rather than discarding it — but only from the TRAILING position, so a leading road
// designation like "A4 Great West Road" is not mistaken for an area refinement.
let mut postcode_area: Option<String> = None;
let mut consumed_partial_tokens: FxHashSet<usize> = FxHashSet::default();
if full_postcode.is_none() && !tokens.is_empty() {
let last = tokens.len() - 1;
if !skip_postcode_tokens.contains(&last) {
let sector_digit =
tokens[last].len() == 1 && tokens[last].chars().all(|ch| ch.is_ascii_digit());
if last >= 1
&& sector_digit
&& !skip_postcode_tokens.contains(&(last - 1))
&& looks_like_postcode_fragment(&tokens[last - 1])
{
postcode_area = Some(format!(
"{}{}",
tokens[last - 1].to_ascii_uppercase(),
tokens[last]
));
consumed_partial_tokens.insert(last);
consumed_partial_tokens.insert(last - 1);
} else if looks_like_postcode_fragment(&tokens[last]) {
postcode_area = Some(tokens[last].to_ascii_uppercase());
consumed_partial_tokens.insert(last);
}
}
}
let mut text_groups = Vec::new();
let mut numeric_terms = Vec::new();
let mut candidate_terms = Vec::new();
for (idx, token) in tokens.iter().enumerate() {
if skip_postcode_tokens.contains(&idx) || looks_like_postcode_fragment(token) {
if skip_postcode_tokens.contains(&idx)
|| consumed_partial_tokens.contains(&idx)
|| looks_like_postcode_fragment(token)
{
continue;
}
@ -486,6 +658,7 @@ fn parse_address_query(query: &str) -> AddressQuery {
AddressQuery {
full_postcode,
postcode_area,
text_groups,
numeric_terms,
candidate_terms,
@ -897,9 +1070,15 @@ impl PropertyData {
&self.address_search_token_keys[offset..offset + length]
}
/// Search individual property addresses. Full postcode queries use a direct row index;
/// free-text queries use a small inverted index over distinctive address tokens.
pub fn search_addresses(&self, query: &str, limit: usize) -> Vec<usize> {
/// Search individual property addresses, returning `(row, score)` ranked best-first.
///
/// Candidate rows come from intersecting the posting lists of the distinctive words the
/// user typed in full (so "Cherry Hinton Road" narrows to rows containing both), unioned
/// with the exact-postcode rows when a complete postcode is present (so a postcode is a
/// boost, not an all-or-nothing gate). An appended partial postcode keeps in-area rows
/// ahead of the candidate cut and adds a scoring bias. With a road-type word and no house
/// number, results collapse to one row per street.
pub fn search_addresses(&self, query: &str, limit: usize) -> Vec<(usize, i32)> {
if limit == 0 {
return Vec::new();
}
@ -912,25 +1091,45 @@ impl PropertyData {
return Vec::new();
}
let candidate_rows: Vec<u32> = if let Some(postcode) = parsed.full_postcode.as_deref() {
self.postcode_interner
let mut candidate_rows = self.address_candidate_rows(&parsed.candidate_terms);
// A complete postcode contributes its rows too, instead of replacing the road match.
if let Some(postcode) = parsed.full_postcode.as_deref() {
if let Some(rows) = self
.postcode_interner
.get(postcode)
.and_then(|key| self.postcode_row_index.get(&key))
.map(|rows| rows.to_vec())
.unwrap_or_default()
} else if let Some(rows) = self.best_address_token_rows(&parsed.candidate_terms) {
rows.iter()
.take(ADDRESS_SEARCH_CANDIDATE_LIMIT)
.copied()
.collect()
} else {
Vec::new()
};
{
candidate_rows = if candidate_rows.is_empty() {
rows.clone()
} else {
union_sorted(&candidate_rows, rows)
};
}
}
if candidate_rows.is_empty() {
return Vec::new();
}
// When the user appended a partial postcode, keep in-area rows ahead of the cut so the
// refinement still surfaces even for very common roads. Single pass (stable partition) so
// the postcode check — which allocates — runs exactly once per candidate.
if let Some(area) = parsed.postcode_area.as_deref() {
let mut in_area = Vec::new();
let mut others = Vec::new();
for &row in &candidate_rows {
if self.row_postcode_in_area(row as usize, area) {
in_area.push(row);
} else {
others.push(row);
}
}
in_area.extend(others);
candidate_rows = in_area;
}
candidate_rows.truncate(ADDRESS_SEARCH_CANDIDATE_LIMIT);
let mut scored: Vec<(i32, usize, usize)> = candidate_rows
.into_iter()
.filter_map(|row| {
@ -948,18 +1147,29 @@ impl PropertyData {
.then(left.2.cmp(&right.2))
});
// Collapse a road browse (road-type word, no house number) to one row per street.
let collapse_streets = parsed.numeric_terms.is_empty() && query_has_road_type(query);
let mut seen = FxHashSet::default();
let mut results = Vec::with_capacity(limit);
for (_, _, row) in scored {
for (score, _, row) in scored {
let address = self.address(row).trim();
if address.is_empty() {
continue;
}
let key = format!("{}\n{}", address.to_ascii_lowercase(), self.postcode(row));
let key = if collapse_streets {
format!(
"{}\n{}",
street_key(address),
outcode_of(self.postcode(row))
)
} else {
format!("{}\n{}", address.to_ascii_lowercase(), self.postcode(row))
};
if !seen.insert(key) {
continue;
}
results.push(row);
results.push((row, score));
if results.len() == limit {
break;
}
@ -968,36 +1178,75 @@ impl PropertyData {
results
}
fn best_address_token_rows(&self, terms: &[String]) -> Option<&[u32]> {
let mut best: Option<&[u32]> = None;
for term in terms {
if let Some(rows) = self.address_token_index.get(term) {
if best.is_none_or(|current| rows.len() < current.len()) {
best = Some(rows.as_slice());
}
continue;
}
if term.len() < 4 {
continue;
}
if let Some(tokens) = self.address_prefix_index.get(address_prefix_key(term)) {
for token in tokens {
if !token.starts_with(term) {
continue;
}
if let Some(rows) = self.address_token_index.get(token) {
if best.is_none_or(|current| rows.len() < current.len()) {
best = Some(rows.as_slice());
}
}
}
/// True when the row's postcode begins with the compact partial-postcode `area`
/// (e.g. "NW1" or "NW16" matches "NW1 6XE").
///
/// The row's postcode is compared with whitespace removed and ASCII letters uppercased.
/// `area` is compared as given, so callers must pass it compact and uppercase. An empty
/// `area` matches every row.
fn row_postcode_in_area(&self, row: usize, area: &str) -> bool {
    // Stream the normalised postcode instead of building a temporary `String`: this check runs
    // for every candidate row, so a per-call allocation is avoidable overhead.
    let mut compact = self
        .postcode(row)
        .chars()
        .filter(|ch| !ch.is_whitespace())
        .map(|ch| ch.to_ascii_uppercase());
    area.chars().all(|wanted| compact.next() == Some(wanted))
}
best
/// Gathers candidate rows for the distinctive query words.
///
/// Words that match an indexed token exactly have their posting lists intersected, smallest
/// list first. Words with no exact match are ignored in that case. If no word matches exactly
/// (the user is still typing), candidates are seeded from the smallest prefix-expanded posting
/// list so partial typing keeps working.
fn address_candidate_rows(&self, terms: &[String]) -> Vec<u32> {
    let mut postings: Vec<&[u32]> = Vec::with_capacity(terms.len());
    for term in terms {
        if let Some(rows) = self.address_token_index.get(term) {
            postings.push(rows.as_slice());
        }
    }
    // Starting from the shortest list keeps every intermediate intersection small.
    postings.sort_by_key(|rows| rows.len());

    match postings.split_first() {
        None => self.prefix_seed_rows(terms),
        Some((smallest, rest)) => {
            let mut acc = smallest.to_vec();
            for rows in rest {
                if acc.is_empty() {
                    break;
                }
                acc = intersect_sorted(&acc, rows);
            }
            acc
        }
    }
}
/// Seeds rows from prefix expansion — used only when no word matched an indexed token exactly
/// (i.e. the user is still typing the final word).
///
/// For each term of at least `ADDRESS_SEARCH_PREFIX_MIN_LEN` bytes, unions the posting lists of
/// all indexed tokens that start with it. Returns the smallest non-empty union across terms, or
/// an empty list when nothing expands.
fn prefix_seed_rows(&self, terms: &[String]) -> Vec<u32> {
    let mut smallest: Option<Vec<u32>> = None;
    let long_enough = terms
        .iter()
        .filter(|term| term.len() >= ADDRESS_SEARCH_PREFIX_MIN_LEN);
    for term in long_enough {
        let Some(tokens) = self.address_prefix_index.get(address_prefix_key(term)) else {
            continue;
        };

        let mut expanded: Vec<u32> = Vec::new();
        for token in tokens {
            if token.starts_with(term) {
                if let Some(rows) = self.address_token_index.get(token) {
                    expanded = if expanded.is_empty() {
                        rows.clone()
                    } else {
                        union_sorted(&expanded, rows)
                    };
                }
            }
        }
        if expanded.is_empty() {
            continue;
        }

        let improves = match &smallest {
            None => true,
            Some(current) => expanded.len() < current.len(),
        };
        if improves {
            smallest = Some(expanded);
        }
    }
    smallest.unwrap_or_default()
}
fn address_match_score(&self, row: usize, parsed: &AddressQuery) -> Option<i32> {
@ -1037,6 +1286,12 @@ impl PropertyData {
if numeric_matches == parsed.numeric_terms.len() && numeric_matches > 0 {
score += 50;
}
// Additive bias (never a filter) when the row sits in the appended partial postcode.
if let Some(area) = parsed.postcode_area.as_deref() {
if self.row_postcode_in_area(row, area) {
score += 400;
}
}
Some(score)
}
@ -1969,16 +2224,23 @@ impl PropertyData {
}
}
}
let address_token_count_before_prune = address_token_index.len();
address_token_index.retain(|_, rows| rows.len() <= ADDRESS_SEARCH_MAX_POSTINGS_PER_TOKEN);
// Keep every distinctive token: common road words ("high", "church", "station") are
// exactly what people search, and dropping them made those roads unsearchable while a
// prefix fallback surfaced the wrong street ("Highbury" for "High"). The candidate scan
// is bounded per query instead (ADDRESS_SEARCH_CANDIDATE_LIMIT), and stop words are
// already excluded from the index, so the largest posting lists stay modest.
let max_postings = address_token_index
.values()
.map(Vec::len)
.max()
.unwrap_or(0);
let address_prefix_index = build_address_prefix_index(&address_token_index);
let address_search_interner = address_search_rodeo.into_reader();
let address_postings_count: usize = address_token_index.values().map(Vec::len).sum();
tracing::info!(
tokens = address_token_index.len(),
prefixes = address_prefix_index.len(),
pruned_tokens =
address_token_count_before_prune.saturating_sub(address_token_index.len()),
max_postings_per_token = max_postings,
postings = address_postings_count,
row_tokens = address_search_token_keys.len(),
"Address search index built"
@ -2340,6 +2602,79 @@ mod tests {
assert_eq!(parsed.candidate_terms, vec!["downing".to_string()]);
}
#[test]
fn address_query_recovers_appended_partial_postcode_as_bias() {
    let AddressQuery {
        full_postcode,
        postcode_area,
        numeric_terms,
        candidate_terms,
        ..
    } = parse_address_query("Baker Street NW1");

    // The trailing outcode becomes an area bias, not a full postcode.
    assert!(full_postcode.is_none());
    assert_eq!(postcode_area.as_deref(), Some("NW1"));
    // The road words are still searchable; the postcode fragment did not consume them.
    assert_eq!(candidate_terms, vec!["baker".to_string()]);
    assert!(numeric_terms.is_empty());
}
#[test]
fn address_query_recovers_outcode_plus_sector_without_a_phantom_house_number() {
    let AddressQuery {
        postcode_area,
        numeric_terms,
        candidate_terms,
        ..
    } = parse_address_query("High Street CR0 2");

    // Outcode and sector digit are fused into one compact area.
    assert_eq!(postcode_area.as_deref(), Some("CR02"));
    // The lone sector digit must not be treated as a house number.
    assert!(numeric_terms.is_empty());
    assert_eq!(candidate_terms, vec!["high".to_string()]);
}
#[test]
fn full_postcode_takes_precedence_over_partial_bias() {
    let query = parse_address_query("Baker Street NW1 6XE");

    // A complete postcode is reported as such and leaves no partial-area bias behind.
    assert!(query.postcode_area.is_none());
    assert_eq!(query.full_postcode.as_deref(), Some("NW1 6XE"));
}
#[test]
fn intersect_and_union_sorted_row_ids() {
    // Intersection keeps only the ids present in both sorted lists.
    let overlapping = intersect_sorted(&[1, 2, 3, 5], &[2, 3, 4, 5]);
    assert_eq!(overlapping, vec![2, 3, 5]);
    let disjoint = intersect_sorted(&[1, 2], &[3, 4]);
    assert_eq!(disjoint, Vec::<u32>::new());

    // Union merges both lists, emitting a shared id once.
    let merged = union_sorted(&[1, 3, 5], &[2, 3, 4]);
    assert_eq!(merged, vec![1, 2, 3, 4, 5]);
    let with_empty_side = union_sorted(&[], &[2, 4]);
    assert_eq!(with_empty_side, vec![2, 4]);
}
#[test]
fn street_key_collapses_house_numbers_and_flats() {
    // Different house numbers, flat prefixes and lettered numbers all map to the bare street.
    let cases = [
        ("12 Baker Street", "baker street"),
        ("5 Baker Street", "baker street"),
        ("Flat 2, 10 Downing Street", "downing street"),
        ("221B Baker Street", "baker street"),
    ];
    for (address, expected) in cases {
        assert_eq!(street_key(address), expected);
    }
}
#[test]
fn street_key_keeps_ordinal_street_names() {
    // An ordinal belongs to the street name; only a genuine house number in front is dropped.
    for (address, expected) in [("2nd Avenue", "2nd avenue"), ("12 3rd Avenue", "3rd avenue")] {
        assert_eq!(street_key(address), expected);
    }

    // Only a number with an ordinal suffix counts as an ordinal token.
    assert!(is_ordinal_token("21st"));
    for not_ordinal in ["21", "221b"] {
        assert!(!is_ordinal_token(not_ordinal));
    }
}
#[test]
fn postcode_area_recovered_only_from_the_trailing_position() {
    // "A4" opens the query as a road designation, so it must not become an area refinement.
    let leading = parse_address_query("A4 Great West Road");
    assert!(leading.postcode_area.is_none());

    // In the trailing slot a token of the same shape is a genuine outcode.
    let trailing = parse_address_query("Great West Road W4");
    assert_eq!(trailing.postcode_area.as_deref(), Some("W4"));
}
#[test]
fn road_type_detection() {
    // Queries ending in a road-type word are detected...
    for with_road_type in ["high street", "acacia avenue"] {
        assert!(query_has_road_type(with_road_type));
    }
    // ...while a bare name or a town is not.
    for without_road_type in ["acacia", "london"] {
        assert!(!query_has_road_type(without_road_type));
    }
}
#[test]
fn address_query_parsing_keeps_partial_terms_for_row_matching() {
let parsed = parse_address_query("settlers cour");

View file

@ -507,8 +507,7 @@ async fn main() -> anyhow::Result<()> {
"property_borders.pmtiles",
);
let noise_overlay_reader =
init_required_tile_reader("Noise", &noise_overlay_tiles).await?;
let noise_overlay_reader = init_required_tile_reader("Noise", &noise_overlay_tiles).await?;
let satellite_reader = init_required_tile_reader("Satellite", &satellite_tiles).await?;
let satellite_highres_reader =
init_required_tile_reader("Satellite high-res", &satellite_highres_tiles).await?;

View file

@ -2,14 +2,26 @@ use std::sync::Arc;
use axum::extract::{Query, State};
use axum::response::Json;
use rustc_hash::FxHashSet;
use serde::{Deserialize, Serialize};
use tracing::info;
use crate::api_error::ApiError;
use crate::consts::PLACES_LIMIT;
use crate::data::{normalize_search_text, slugify};
use crate::data::{
compute_trigrams, normalize_search_text, place_alias_tokens, slugify, trigram_similarity,
};
use crate::state::SharedState;
/// Connective words dropped from a place query (wherever they occur) so "fish and chips"
/// matches a place stored (after `&` is normalized away) as "fish chips".
const QUERY_STOP_WORDS: &[&str] = &["and", "the", "of"];
/// Minimum trigram similarity for a fuzzy place match.
const FUZZY_MIN_SIMILARITY: f32 = 0.42;
/// Run the (linear) fuzzy pass only when the exact passes found fewer than this.
const FUZZY_TRIGGER_BELOW: usize = 3;
#[derive(Serialize)]
pub struct PlaceResult {
name: String,
@ -29,6 +41,43 @@ pub struct AddressResult {
lon: f32,
}
/// A single, category-tagged, relevance-scored result. The frontend renders these in order,
/// so ranking is unified across places, outcodes, postcodes and addresses instead of the old
/// fixed positional bucketing.
///
/// Serialized as an internally tagged object: the lowercased variant name is written to a
/// `type` field (`"place"`, `"postcode"` or `"address"`) next to the variant's own fields.
#[derive(Serialize)]
#[serde(tag = "type", rename_all = "lowercase")]
pub enum UnifiedResult {
/// A named place. `get_places` also emits outcode matches through this variant, with
/// `place_type` set to `"outcode"`.
Place {
/// Display name of the place.
name: String,
/// Slug derived from the name (for outcodes, the lowercased outcode).
slug: String,
/// Place category label.
place_type: String,
/// Latitude of the place (for outcodes, of the centroid).
lat: f32,
/// Longitude of the place (for outcodes, of the centroid).
lon: f32,
/// Containing city, when known; left out of the JSON entirely when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
city: Option<String>,
/// Relevance score; the merged list is sorted by this value, highest first.
score: f32,
},
/// A postcode from the full-postcode prefix list.
Postcode {
/// The postcode text to show.
label: String,
/// Relevance score; the merged list is sorted by this value, highest first.
score: f32,
},
/// An individual property address.
Address {
/// Address line (trimmed by the caller).
address: String,
/// Postcode of the address.
postcode: String,
/// Latitude of the property.
lat: f32,
/// Longitude of the property.
lon: f32,
/// Relevance score; the merged list is sorted by this value, highest first.
score: f32,
},
}
/// Returns the relevance score carried by a [`UnifiedResult`], whichever variant it is.
fn unified_score(result: &UnifiedResult) -> f32 {
    let score = match result {
        UnifiedResult::Place { score, .. } => score,
        UnifiedResult::Postcode { score, .. } => score,
        UnifiedResult::Address { score, .. } => score,
    };
    *score
}
#[derive(Serialize)]
pub struct PlacesResponse {
places: Vec<PlaceResult>,
@ -36,6 +85,9 @@ pub struct PlacesResponse {
postcodes: Vec<String>,
#[serde(skip_serializing_if = "Vec::is_empty")]
addresses: Vec<AddressResult>,
/// Unified, relevance-ordered results. Preferred by the frontend; the arrays above remain
/// for backward compatibility.
results: Vec<UnifiedResult>,
}
#[derive(Deserialize)]
@ -44,6 +96,9 @@ pub struct PlacesParams {
q: String,
/// If set, only return places that have travel time data for this mode.
mode: Option<String>,
/// Optional map-viewport centre used to bias ranking toward what the user is looking at.
lat: Option<f32>,
lng: Option<f32>,
}
fn compact_postcode_query(query: &str) -> String {
@ -93,6 +148,131 @@ fn postcode_starts_with_compact(postcode: &str, compact_query: &str) -> bool {
current.is_none()
}
/// Heuristic for a token shaped like an outcode or partial postcode ("nw1", "cr0", "sw1a"):
/// two to four ASCII alphanumerics, starting with a letter and containing at least one digit.
fn is_postcode_fragmentish(token: &str) -> bool {
    let bytes = token.as_bytes();
    if !(2..=4).contains(&bytes.len()) {
        return false;
    }
    // Byte-wise checks are equivalent to char-wise ones here: every byte of a multi-byte
    // character is non-ASCII, so it fails the alphanumeric test and rejects the token.
    let starts_with_letter = bytes.first().is_some_and(u8::is_ascii_alphabetic);
    let has_digit = bytes.iter().any(u8::is_ascii_digit);
    let only_alphanumeric = bytes.iter().all(u8::is_ascii_alphanumeric);
    starts_with_letter && has_digit && only_alphanumeric
}
/// Splits a trailing geographic refinement off `query`, returning `(core, refinement)`.
///
/// Two trailing shapes are recognised, checked in this order:
/// - outcode + single sector digit, only when at least one core word remains:
///   "high street cr0 2" → ("high street", Some("CR02"));
/// - a lone postcode-like token: "camden nw1" → ("camden", Some("NW1")).
///
/// The refinement is upper-cased and the core words are re-joined with single spaces. A query
/// with fewer than two words, or without such a tail, is returned unchanged with `None`.
fn split_geographic_refinement(query: &str) -> (String, Option<String>) {
    let unsplit = || -> (String, Option<String>) { (query.to_string(), None) };

    let words: Vec<&str> = query.split_whitespace().collect();
    let Some((&last, head)) = words.split_last() else {
        return unsplit();
    };
    if head.is_empty() {
        // A single word is never split; bare outcodes are handled by the caller.
        return unsplit();
    }

    // Outcode followed by a lone sector digit, leaving at least one word of core text.
    if let Some((&prev, core)) = head.split_last() {
        let is_sector_digit = last.len() == 1 && last.bytes().all(|byte| byte.is_ascii_digit());
        if !core.is_empty() && is_sector_digit && is_postcode_fragmentish(prev) {
            let mut area = prev.to_ascii_uppercase();
            area.push_str(last);
            return (core.join(" "), Some(area));
        }
    }

    // Plain trailing outcode / partial postcode.
    if is_postcode_fragmentish(last) {
        return (head.join(" "), Some(last.to_ascii_uppercase()));
    }

    unsplit()
}
/// Splits a normalized place query on single spaces and keeps only its content words.
///
/// Empty fragments and every entry of `QUERY_STOP_WORDS` are discarded, so
/// "fish and chips" yields `["fish", "chips"]` and can match "Fish & Chips".
fn query_content_tokens(query_search: &str) -> Vec<&str> {
    let mut tokens = Vec::new();
    for token in query_search.split(' ') {
        if token.is_empty() || QUERY_STOP_WORDS.contains(&token) {
            continue;
        }
        tokens.push(token);
    }
    tokens
}
/// Scores how well a place matches the query, before any bonuses are added.
///
/// `search_text` holds the place's aliases separated by `" | "`. Aliases are compared with
/// `query_search` and `name_lower` with `query_lower`. Tiers, strongest first:
/// - `1000.0`: the name or any alias equals the query;
/// - `820.0`: the name or any alias starts with the query;
/// - `640.0`: every query token equals, or (when at least two bytes long) is a prefix of,
///   some token yielded by `place_alias_tokens`.
///
/// Returns `None` when `query_search` is empty or no tier applies.
fn place_base_score(
    search_text: &str,
    name_lower: &str,
    query_search: &str,
    query_lower: &str,
    query_tokens: &[&str],
) -> Option<f32> {
    if query_search.is_empty() {
        return None;
    }

    let aliases = || search_text.split(" | ");

    let is_exact = name_lower == query_lower || aliases().any(|alias| alias == query_search);
    if is_exact {
        return Some(1000.0);
    }

    let is_prefix = name_lower.starts_with(query_lower)
        || aliases().any(|alias| alias.starts_with(query_search));
    if is_prefix {
        return Some(820.0);
    }

    // Token-AND: order-independent, with prefix matching for partially typed words.
    let token_is_covered = |query_token: &str| {
        place_alias_tokens(search_text).any(|token| {
            token == query_token || (query_token.len() >= 2 && token.starts_with(query_token))
        })
    };
    let all_covered = !query_tokens.is_empty()
        && query_tokens
            .iter()
            .all(|&query_token| token_is_covered(query_token));

    all_covered.then_some(640.0)
}
/// Additive ranking bonus from a place's type rank and population.
///
/// The type part is 8 points per rank below 6 (rank 0 earns 48, rank 6 or more earns nothing).
/// The population part is `4 * ln(population + 1)`, capped at 64 points.
fn place_modifiers(type_rank: u8, population: u32) -> f32 {
    let rank_headroom = 6u8.saturating_sub(type_rank);
    let type_bonus = f32::from(rank_headroom) * 8.0;

    let log_population = (population as f32 + 1.0).ln();
    let pop_bonus = f32::min(log_population * 4.0, 64.0);

    type_bonus + pop_bonus
}
/// Distance-decay bonus toward the viewport / refinement centre.
///
/// `center` is `(lat, lon)`. The distance is an equirectangular approximation: the longitude
/// difference is scaled by the cosine of the centre latitude (converted to radians, so the
/// inputs are taken to be degrees). The bonus is `160 * exp(-dist / 0.3)`: exactly 160 at the
/// centre, decaying toward 0 with distance. That ceiling is below the 180-point gap between
/// match tiers, so proximity alone reorders within a tier without overriding an exact match.
///
/// NOTE(review): added to `place_modifiers` (up to 112 points) the combined bonus can exceed
/// the 180-point tier gap — confirm that is intended.
///
/// Returns `0.0` when there is no centre, and also when the result is not finite. The centre
/// can come straight from request parameters, and a NaN or infinite coordinate would otherwise
/// turn every score it touches into NaN and corrupt the ordering.
fn proximity_bonus(center: Option<(f32, f32)>, lat: f32, lon: f32) -> f32 {
    let Some((center_lat, center_lon)) = center else {
        return 0.0;
    };
    let dlat = lat - center_lat;
    let dlon = (lon - center_lon) * center_lat.to_radians().cos();
    let dist = (dlat * dlat + dlon * dlon).sqrt();
    let bonus = 160.0 * (-dist / 0.3).exp();
    if bonus.is_finite() {
        bonus
    } else {
        0.0
    }
}
/// Converts an address match's raw specificity score to the unified relevance scale.
///
/// The raw score is capped at 1000, scaled by 0.47 and offset by 460, so the cap maps to
/// roughly 930. There is no lower clamp: a negative raw score lands below 460.
fn address_unified_score(raw: i32) -> f32 {
    let capped = if raw > 1000 { 1000 } else { raw };
    460.0 + capped as f32 * 0.47
}
/// Resolve the outcode a compact partial postcode sits in (e.g. "NW16" → "nw1").
///
/// `area` is lowercased, then its prefixes are tried from the full string down to two bytes
/// against `name_lower` (the lowercased outcode names). The index of the first, i.e. longest,
/// matching prefix is returned, or `None` when no prefix of at least two bytes is a known
/// outcode.
///
/// Prefixes are taken with the checked `str::get`, so a length that falls inside a multi-byte
/// character is skipped instead of panicking on non-ASCII input.
fn resolve_outcode_idx(name_lower: &[String], area: &str) -> Option<usize> {
    let area_lower = area.to_lowercase();
    (2..=area_lower.len())
        .rev()
        .filter_map(|len| area_lower.get(..len))
        .find_map(|candidate| name_lower.iter().position(|name| name == candidate))
}
pub async fn get_places(
State(shared): State<Arc<SharedState>>,
Query(params): Query<PlacesParams>,
@ -106,154 +286,229 @@ pub async fn get_places(
let limit = PLACES_LIMIT;
let mode_filter = params.mode;
let viewport = match (params.lat, params.lng) {
(Some(lat), Some(lng)) => Some((lat, lng)),
_ => None,
};
let places = tokio::task::spawn_blocking(move || {
let response = tokio::task::spawn_blocking(move || {
let t0 = std::time::Instant::now();
let query_lower = query.to_lowercase();
let query_search = normalize_search_text(&query);
let pd = &state.place_data;
let od = &state.outcode_data;
let postcode_data = &state.postcode_data;
let tt_store = &state.travel_time_store;
let property_data = &state.data;
// Linear scan — ~50-100k rows, <1ms
// Tuple: (row_idx, is_exact, is_prefix, type_rank, population, name_len, slug)
let mut matches: Vec<(usize, bool, bool, u8, u32, usize, String)> = pd
.name_search
.iter()
.enumerate()
.filter_map(|(idx, search_text)| {
if query_search.is_empty() || !search_text.contains(&query_search) {
return None;
}
let slug = slugify(&pd.name[idx]);
// Peel any appended outcode/partial-postcode so the place text matches on the core
// words while the refinement biases ranking and drives the outcode/postcode lists.
let (split_query, refinement) = split_geographic_refinement(&query);
// Only honour the refinement when it resolves to a real outcode; otherwise (e.g. "the o2",
// where "o2" looks postcode-ish but is not an outcode) treat the whole query as place text.
let refinement_outcode = refinement
.as_deref()
.and_then(|area| resolve_outcode_idx(&od.name_lower, area));
let place_query = if refinement.is_some() && refinement_outcode.is_none() {
query.clone()
} else {
split_query
};
let query_search = normalize_search_text(&place_query);
let query_lower = place_query.to_lowercase();
let query_tokens = query_content_tokens(&query_search);
// If mode filter is set, keep the historical travel destination set only.
if let Some(ref mode) = mode_filter {
if !pd.travel_destination[idx] || !tt_store.has_destination(mode, &slug) {
return None;
}
}
// Bias centre: explicit viewport, else the resolved refinement outcode's centroid.
let bias_center = viewport.or_else(|| refinement_outcode.map(|idx| od.centroids[idx]));
let is_exact = search_text
.split(" | ")
.any(|alias| alias == query_search || pd.name_lower[idx] == query_lower);
let is_prefix = search_text
.split(" | ")
.any(|alias| alias.starts_with(&query_search))
|| pd.name_lower[idx].starts_with(&query_lower);
Some((
idx,
is_exact,
is_prefix,
pd.type_rank[idx],
pd.population[idx],
pd.name[idx].len(),
slug,
))
// ---- Places: candidate rows from the inverted token index, then exact/prefix/token-AND
// scoring — bounded by matched candidates, not the ~1M-row corpus. Fuzzy fallback uses the
// (small) trigram index over fuzzy-eligible rows only.
let mut place_results: Vec<(f32, PlaceResult)> = Vec::new();
let mut matched_place_idx: FxHashSet<usize> = FxHashSet::default();
let make_place = |idx: usize| PlaceResult {
name: pd.name[idx].clone(),
slug: slugify(&pd.name[idx]),
place_type: pd.place_type.get(idx).to_string(),
lat: pd.lat[idx],
lon: pd.lon[idx],
city: pd.city[idx].clone(),
};
let passes_mode = |idx: usize| {
mode_filter.as_ref().is_none_or(|mode| {
pd.travel_destination[idx]
&& tt_store.has_destination(mode, &slugify(&pd.name[idx]))
})
.collect();
};
// Sort: exact first, then prefix, then type rank asc, then population desc, then name length asc
matches.sort_unstable_by(|lhs, rhs| {
rhs.1
.cmp(&lhs.1)
.then(rhs.2.cmp(&lhs.2))
.then(lhs.3.cmp(&rhs.3))
.then(rhs.4.cmp(&lhs.4))
.then(lhs.5.cmp(&rhs.5))
});
for row in pd.place_candidate_rows(&query_tokens) {
let idx = row as usize;
let Some(base) = place_base_score(
&pd.name_search[idx],
&pd.name_lower[idx],
&query_search,
&query_lower,
&query_tokens,
) else {
continue;
};
if !passes_mode(idx) {
continue;
}
let score = base
+ place_modifiers(pd.type_rank[idx], pd.population[idx])
+ proximity_bonus(bias_center, pd.lat[idx], pd.lon[idx]);
matched_place_idx.insert(idx);
place_results.push((score, make_place(idx)));
}
matches.truncate(limit);
// Fuzzy (trigram) fallback only when the exact passes were thin and the query is long
// enough to be discriminating.
if place_results.len() < FUZZY_TRIGGER_BELOW && query_search.len() >= 4 {
let query_trigrams = compute_trigrams(&place_query);
for row in pd.fuzzy_candidate_rows(&query_trigrams) {
let idx = row as usize;
if matched_place_idx.contains(&idx) || !passes_mode(idx) {
continue;
}
let similarity =
trigram_similarity(&query_trigrams, &compute_trigrams(&pd.name[idx]));
if similarity < FUZZY_MIN_SIMILARITY {
continue;
}
let score = 280.0
+ similarity * 120.0
+ place_modifiers(pd.type_rank[idx], pd.population[idx])
+ proximity_bonus(bias_center, pd.lat[idx], pd.lon[idx]);
matched_place_idx.insert(idx);
place_results.push((score, make_place(idx)));
}
}
let mut results: Vec<PlaceResult> = matches
.iter()
.map(|(idx, .., slug)| PlaceResult {
name: pd.name[*idx].clone(),
slug: slug.clone(),
place_type: pd.place_type.get(*idx).to_string(),
lat: pd.lat[*idx],
lon: pd.lon[*idx],
city: pd.city[*idx].clone(),
})
.collect();
// Also search outcodes (skip when mode filter is set — outcodes aren't travel destinations)
if mode_filter.is_none() {
let query_upper = query_lower.to_uppercase();
let mut outcode_results: Vec<PlaceResult> = od
.name_lower
.iter()
.enumerate()
.filter_map(|(idx, name)| {
if !name.starts_with(&query_lower) {
return None;
}
let is_exact = name.len() == query_lower.len();
Some((idx, is_exact))
})
.collect::<Vec<_>>()
.into_iter()
.map(|(idx, _is_exact)| PlaceResult {
// ---- Outcodes (skipped under a mode filter) ----
let push_outcode = |results: &mut Vec<(f32, PlaceResult)>, idx: usize, base: f32| {
let (clat, clon) = od.centroids[idx];
results.push((
base + proximity_bonus(bias_center, clat, clon),
PlaceResult {
name: od.names[idx].clone(),
slug: od.names[idx].to_lowercase(),
place_type: "outcode".to_string(),
lat: od.centroids[idx].0,
lon: od.centroids[idx].1,
lat: clat,
lon: clon,
city: od.cities[idx].clone(),
})
.collect();
// Sort outcodes: exact first, then by name length (shorter = broader area)
outcode_results.sort_unstable_by(|a, b| {
let a_exact = a.name.eq_ignore_ascii_case(&query_upper);
let b_exact = b.name.eq_ignore_ascii_case(&query_upper);
b_exact.cmp(&a_exact).then(a.name.len().cmp(&b.name.len()))
});
// Prepend outcode results (up to 3) before place results, keeping total ≤ limit
outcode_results.truncate(3);
let place_slots = limit.saturating_sub(outcode_results.len());
results.truncate(place_slots);
outcode_results.append(&mut results);
results = outcode_results;
},
));
};
if mode_filter.is_none() {
if let Some(idx) = refinement_outcode {
// A refinement ("camden nw1") resolves to exactly one outcode — no NW10/NW11 noise.
push_outcode(&mut place_results, idx, 980.0);
} else if looks_like_postcode_prefix(&query) {
// A bare postcode-prefix query ("e1") lists matching outcodes (e1, e10, e11, ...).
let area_lower = compact_postcode_query(&query).to_lowercase();
for idx in 0..od.names.len() {
let name = &od.name_lower[idx];
let is_exact = *name == area_lower;
if !(name.starts_with(&area_lower) || area_lower.starts_with(name.as_str())) {
continue;
}
push_outcode(
&mut place_results,
idx,
if is_exact { 980.0 } else { 760.0 },
);
}
}
}
let postcodes: Vec<String> = if mode_filter.is_none() && looks_like_postcode_prefix(&query)
{
let compact_query = compact_postcode_query(&query);
postcode_data
.postcodes
.iter()
.filter(|postcode| postcode_starts_with_compact(postcode, &compact_query))
.filter(|postcode| !property_data.rows_for_postcode(postcode).is_empty())
.take(limit)
.cloned()
.collect()
} else {
Vec::new()
};
place_results.sort_by(|left, right| right.0.total_cmp(&left.0));
place_results.truncate(limit);
let addresses: Vec<AddressResult> = if mode_filter.is_none() {
property_data
.search_addresses(&query, limit)
.into_iter()
.map(|row| AddressResult {
address: property_data.address(row).trim().to_string(),
postcode: property_data.postcode(row).to_string(),
lat: property_data.lat[row],
lon: property_data.lon[row],
})
.collect()
} else {
Vec::new()
};
// ---- Postcodes (full-postcode prefix list) ----
let mut postcode_results: Vec<(f32, String)> = Vec::new();
if mode_filter.is_none() && looks_like_postcode_prefix(&query) {
let compact_query = compact_postcode_query(&query);
for postcode in &postcode_data.postcodes {
if !postcode_starts_with_compact(postcode, &compact_query) {
continue;
}
if property_data.rows_for_postcode(postcode).is_empty() {
continue;
}
let compact_pc: String =
postcode.chars().filter(|ch| !ch.is_whitespace()).collect();
let score = if compact_pc == compact_query {
960.0
} else {
900.0
};
postcode_results.push((score, postcode.clone()));
if postcode_results.len() >= limit {
break;
}
}
}
postcode_results.sort_by(|left, right| right.0.total_cmp(&left.0));
// ---- Addresses ----
let mut address_results: Vec<(f32, AddressResult)> = Vec::new();
if mode_filter.is_none() {
for (row, raw) in property_data.search_addresses(&query, limit) {
let lat = property_data.lat[row];
let lon = property_data.lon[row];
let score = address_unified_score(raw) + proximity_bonus(bias_center, lat, lon);
address_results.push((
score,
AddressResult {
address: property_data.address(row).trim().to_string(),
postcode: property_data.postcode(row).to_string(),
lat,
lon,
},
));
}
}
address_results.sort_by(|left, right| right.0.total_cmp(&left.0));
// ---- Unified merge: one relevance-ordered list across every source ----
let mut unified: Vec<UnifiedResult> = Vec::new();
for (score, place) in &place_results {
unified.push(UnifiedResult::Place {
name: place.name.clone(),
slug: place.slug.clone(),
place_type: place.place_type.clone(),
lat: place.lat,
lon: place.lon,
city: place.city.clone(),
score: *score,
});
}
for (score, postcode) in &postcode_results {
unified.push(UnifiedResult::Postcode {
label: postcode.clone(),
score: *score,
});
}
for (score, address) in &address_results {
unified.push(UnifiedResult::Address {
address: address.address.clone(),
postcode: address.postcode.clone(),
lat: address.lat,
lon: address.lon,
score: *score,
});
}
unified.sort_by(|left, right| unified_score(right).total_cmp(&unified_score(left)));
unified.truncate(limit);
let places: Vec<PlaceResult> = place_results.into_iter().map(|(_, p)| p).collect();
let postcodes: Vec<String> = postcode_results.into_iter().map(|(_, p)| p).collect();
let addresses: Vec<AddressResult> = address_results.into_iter().map(|(_, a)| a).collect();
let elapsed = t0.elapsed();
info!(
query = query.as_str(),
results = results.len(),
results = unified.len(),
places = places.len(),
postcodes = postcodes.len(),
addresses = addresses.len(),
scanned = pd.name_lower.len(),
@ -262,16 +517,17 @@ pub async fn get_places(
"GET /api/places"
);
(results, postcodes, addresses)
PlacesResponse {
places,
postcodes,
addresses,
results: unified,
}
})
.await
.map_err(|error| ApiError::Internal(error.to_string()))?;
Ok(Json(PlacesResponse {
places: places.0,
postcodes: places.1,
addresses: places.2,
}))
Ok(Json(response))
}
#[cfg(test)]
@ -293,4 +549,88 @@ mod tests {
assert!(postcode_starts_with_compact("SW1A 1AA", "SW1A1"));
assert!(!postcode_starts_with_compact("SW1A 1AA", "SW1A2"));
}
#[test]
fn refinement_splits_off_trailing_outcode() {
    // (query, expected core, expected refinement)
    let cases: [(&str, &str, Option<&str>); 4] = [
        ("camden nw1", "camden", Some("NW1")),
        ("high street cr0 2", "high street", Some("CR02")),
        // A bare outcode is not split (handled by the outcode/postcode path directly).
        ("e14", "e14", None),
        // Nothing postcode-like at the end: returned unchanged.
        ("baker street", "baker street", None),
    ];
    for (query, core, area) in cases {
        assert_eq!(
            split_geographic_refinement(query),
            (core.to_string(), area.map(String::from))
        );
    }
}
#[test]
fn query_tokens_drop_connectives() {
    // Connectives vanish wherever they sit; the content words keep their order.
    let cases = [
        ("fish and chips", vec!["fish", "chips"]),
        ("isle of dogs", vec!["isle", "dogs"]),
    ];
    for (query, expected) in cases {
        assert_eq!(query_content_tokens(query), expected);
    }
}
/// Test helper: scores `query` against a place whose lowercased name and alias text are both
/// `search`, deriving the normalized query, lowercased query and content tokens from `query`.
fn base(search: &str, query: &str) -> Option<f32> {
    let normalized = normalize_search_text(query);
    let lowered = query.to_lowercase();
    place_base_score(
        search,
        search,
        &normalized,
        &lowered,
        &query_content_tokens(&normalized),
    )
}
#[test]
fn place_match_tiers_order_exact_above_prefix_above_token_and() {
    // One representative per tier, strongest first.
    let [exact, prefix, token_and] = [
        base("camden", "camden"),
        base("camden town", "camden"),
        base("camden market", "market camden"),
    ]
    .map(|score| score.unwrap());
    assert!(exact > prefix);
    assert!(prefix > token_and);

    // Word order does not matter for a multi-word query: token-AND still matches.
    assert!(base("manchester piccadilly", "piccadilly manchester").is_some());

    let non_matches = [
        // Candidates are token-based, so a pure infix no longer matches: "ford" must not
        // surface "Stratford" (the old population-dominated noise).
        ("stratford", "ford"),
        // Appended noise that matches nothing defeats the match (the route strips postcodes
        // before scoring).
        ("camden", "camden zzzz"),
    ];
    for (search, query) in non_matches {
        assert!(base(search, query).is_none());
    }
}
#[test]
fn address_full_postcode_outranks_an_outcode_prefix() {
    // Base score given to an outcode that merely prefix-matches the query.
    const OUTCODE_PREFIX_BASE: f32 = 760.0;

    // raw 1200 ≈ road + full postcode + number; raw 200 is a road-only match.
    let full_postcode = address_unified_score(1200);
    let road_only = address_unified_score(200);

    assert!(full_postcode > OUTCODE_PREFIX_BASE);
    assert!(road_only < OUTCODE_PREFIX_BASE);
    assert!(full_postcode > road_only);
}
#[test]
fn proximity_bonus_decays_and_never_flips_match_tiers() {
    let viewport: Option<(f32, f32)> = Some((51.5, -0.1));
    let here = proximity_bonus(viewport, 51.5, -0.1);
    let far = proximity_bonus(viewport, 53.5, -2.0);

    // Closer is better, and the bonus never exceeds its 160-point ceiling.
    assert!(here > far);
    assert!(here <= 160.0);
    // The ceiling sits under the 180-point gap between exact (1000) and prefix (820).
    assert!(here < 180.0);

    // Without a centre there is nothing to bias toward.
    assert_eq!(proximity_bonus(None, 51.5, -0.1), 0.0);
}
#[test]
fn resolve_outcode_idx_handles_sectorised_area_and_unknown() {
    let names = ["nw1", "e14"].map(String::from);
    let cases = [
        // The sector digit is stripped: "NW16" sits in outcode NW1.
        ("NW16", Some(0)),
        // An exact outcode resolves to itself.
        ("E14", Some(1)),
        // Postcode-like tokens that are not real outcodes resolve to nothing (fold back).
        ("O2", None),
        ("ZZ9", None),
    ];
    for (area, expected) in cases {
        assert_eq!(resolve_outcode_idx(&names, area), expected);
    }
}
}