idk
This commit is contained in:
parent
a04ac2d857
commit
d43da9708c
47 changed files with 4120 additions and 573 deletions
|
|
@ -37,7 +37,9 @@ where
|
|||
|
||||
pub use actual_listings::{ActualListing, ActualListingData};
|
||||
pub use crime_by_year::CrimeByYearData;
|
||||
pub use places::{normalize_search_text, PlaceData};
|
||||
pub use places::{
|
||||
compute_trigrams, normalize_search_text, place_alias_tokens, trigram_similarity, PlaceData,
|
||||
};
|
||||
pub use poi::{resolve_poi_category_filter, POICategoryGroup, POIData, SchoolMetadata};
|
||||
pub use postcodes::{OutcodeData, PostcodeData};
|
||||
pub use property::{
|
||||
|
|
|
|||
|
|
@ -120,7 +120,7 @@ impl CrimeByYearData {
|
|||
.list()
|
||||
.with_context(|| format!("Column '{col_name}' is not a list"))?;
|
||||
|
||||
for row in 0..row_count {
|
||||
for (row, postcode) in postcode_values.iter().enumerate().take(row_count) {
|
||||
let Some(inner) = list_ca.get_as_series(row) else {
|
||||
continue;
|
||||
};
|
||||
|
|
@ -163,7 +163,7 @@ impl CrimeByYearData {
|
|||
points.sort_by_key(|p| p.year);
|
||||
|
||||
series_by_postcode
|
||||
.entry(postcode_values[row].clone())
|
||||
.entry(postcode.clone())
|
||||
.or_default()
|
||||
.push(PostcodeCrimeSeries {
|
||||
type_idx: type_idx as u16,
|
||||
|
|
|
|||
|
|
@ -4,10 +4,16 @@ use anyhow::Context;
|
|||
use polars::frame::DataFrame;
|
||||
use polars::lazy::frame::LazyFrame;
|
||||
use polars::prelude::*;
|
||||
use rustc_hash::FxHashMap;
|
||||
use tracing::info;
|
||||
|
||||
use crate::utils::InternedColumn;
|
||||
|
||||
/// Upper bound on place rows scored per query (candidate sets are normally far smaller).
|
||||
const PLACE_CANDIDATE_LIMIT: usize = 50_000;
|
||||
const PLACE_PREFIX_MIN_LEN: usize = 2;
|
||||
const PLACE_PREFIX_MAX_LEN: usize = 6;
|
||||
|
||||
pub struct PlaceData {
|
||||
pub name: Vec<String>,
|
||||
pub name_lower: Vec<String>,
|
||||
|
|
@ -19,6 +25,13 @@ pub struct PlaceData {
|
|||
pub lon: Vec<f32>,
|
||||
pub city: Vec<Option<String>>,
|
||||
pub travel_destination: Vec<bool>,
|
||||
/// Inverted index from an alias token to the (ascending) place rows containing it. Lets place
|
||||
/// search gather candidates instead of scanning all ~1M+ rows per keystroke.
|
||||
token_index: FxHashMap<String, Vec<u32>>,
|
||||
/// Prefix → indexed tokens, for matching a partially-typed final word.
|
||||
token_prefix_index: FxHashMap<String, Vec<String>>,
|
||||
/// Trigram → fuzzy-eligible rows (settlements/stations only), for bounded typo matching.
|
||||
fuzzy_trigram_index: FxHashMap<u32, Vec<u32>>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
|
|
@ -168,6 +181,148 @@ pub fn normalize_search_text(text: &str) -> String {
|
|||
result
|
||||
}
|
||||
|
||||
/// Iterates over every non-empty token in a place's search text.
///
/// The text is split on both the word separator (`' '`) and the alias separator (`'|'`), so
/// tokens from all aliases come out as one flat sequence, in order and with duplicates kept.
pub fn place_alias_tokens(search_text: &str) -> impl Iterator<Item = &str> {
    search_text
        .split(|ch: char| ch == ' ' || ch == '|')
        .filter(|piece| !piece.is_empty())
}
|
||||
|
||||
/// Hashes three characters into a `u32` with an FNV-1a style fold.
///
/// Each character is mixed in as its full `u32` code point (not byte by byte): XOR into the
/// running hash, then a wrapping multiply by the prime.
fn trigram_hash(first: char, second: char, third: char) -> u32 {
    const OFFSET_BASIS: u32 = 2_166_136_261;
    const PRIME: u32 = 16_777_619;
    [first, second, third]
        .into_iter()
        .fold(OFFSET_BASIS, |acc, ch| (acc ^ u32::from(ch)).wrapping_mul(PRIME))
}
|
||||
|
||||
/// Sorted, de-duplicated padded character trigrams of `text`, for Jaccard fuzzy matching.
|
||||
pub fn compute_trigrams(text: &str) -> Vec<u32> {
|
||||
let norm = normalize_search_text(text);
|
||||
if norm.is_empty() {
|
||||
return Vec::new();
|
||||
}
|
||||
let chars: Vec<char> = [' ', ' ']
|
||||
.into_iter()
|
||||
.chain(norm.chars())
|
||||
.chain(std::iter::once(' '))
|
||||
.collect();
|
||||
let mut grams: Vec<u32> = chars
|
||||
.windows(3)
|
||||
.map(|window| trigram_hash(window[0], window[1], window[2]))
|
||||
.collect();
|
||||
grams.sort_unstable();
|
||||
grams.dedup();
|
||||
grams
|
||||
}
|
||||
|
||||
/// Returns the row ids present in both ascending-sorted slices, in ascending order.
///
/// Walks the two slices in lockstep, always advancing the side holding the smaller head.
fn intersect_sorted(left: &[u32], right: &[u32]) -> Vec<u32> {
    use std::cmp::Ordering;

    let mut shared = Vec::new();
    let (mut lhs, mut rhs) = (left, right);
    while let (Some((&a, lhs_rest)), Some((&b, rhs_rest))) = (lhs.split_first(), rhs.split_first())
    {
        match a.cmp(&b) {
            Ordering::Less => lhs = lhs_rest,
            Ordering::Greater => rhs = rhs_rest,
            Ordering::Equal => {
                shared.push(a);
                lhs = lhs_rest;
                rhs = rhs_rest;
            }
        }
    }
    shared
}
|
||||
|
||||
/// Merges two ascending-sorted row-id slices into one ascending vector.
///
/// A value that is at the head of both inputs at the same time is emitted once. Whatever
/// remains of either slice after the other is exhausted is appended unchanged.
fn union_sorted(left: &[u32], right: &[u32]) -> Vec<u32> {
    use std::cmp::Ordering;

    let mut merged = Vec::with_capacity(left.len() + right.len());
    let (mut lhs, mut rhs) = (left, right);
    while let (Some((&a, lhs_rest)), Some((&b, rhs_rest))) = (lhs.split_first(), rhs.split_first())
    {
        match a.cmp(&b) {
            Ordering::Less => {
                merged.push(a);
                lhs = lhs_rest;
            }
            Ordering::Greater => {
                merged.push(b);
                rhs = rhs_rest;
            }
            Ordering::Equal => {
                merged.push(a);
                lhs = lhs_rest;
                rhs = rhs_rest;
            }
        }
    }
    // At most one of these is non-empty.
    merged.extend_from_slice(lhs);
    merged.extend_from_slice(rhs);
    merged
}
|
||||
|
||||
/// Distinct indexable tokens (len ≥ 2) across all of a place's search aliases. ASCII because
|
||||
/// `normalize_search_text` already dropped non-alphanumerics, so prefix byte-slicing is safe.
|
||||
fn place_index_tokens(search_text: &str) -> Vec<String> {
|
||||
let mut tokens: Vec<String> = place_alias_tokens(search_text)
|
||||
.filter(|token| token.len() >= 2)
|
||||
.map(ToString::to_string)
|
||||
.collect();
|
||||
tokens.sort_unstable();
|
||||
tokens.dedup();
|
||||
tokens
|
||||
}
|
||||
|
||||
fn build_place_prefix_index(
|
||||
token_index: &FxHashMap<String, Vec<u32>>,
|
||||
) -> FxHashMap<String, Vec<String>> {
|
||||
let mut prefix_index: FxHashMap<String, Vec<String>> = FxHashMap::default();
|
||||
for token in token_index.keys() {
|
||||
let max_len = token.len().min(PLACE_PREFIX_MAX_LEN);
|
||||
for len in PLACE_PREFIX_MIN_LEN..=max_len {
|
||||
prefix_index
|
||||
.entry(token[..len].to_string())
|
||||
.or_default()
|
||||
.push(token.clone());
|
||||
}
|
||||
}
|
||||
for tokens in prefix_index.values_mut() {
|
||||
tokens.sort_unstable();
|
||||
tokens.dedup();
|
||||
}
|
||||
prefix_index
|
||||
}
|
||||
|
||||
/// Whether rows of this place type take part in fuzzy (typo) matching.
///
/// Eligibility is defined by exclusion: `street`, `park`, `attraction`, `hospital` and `retail`
/// are left out, and every other type string (compared case-sensitively) is eligible.
fn is_fuzzy_eligible_type(place_type: &str) -> bool {
    const FUZZY_EXCLUDED_TYPES: [&str; 5] = ["street", "park", "attraction", "hospital", "retail"];
    !FUZZY_EXCLUDED_TYPES.contains(&place_type)
}
|
||||
|
||||
/// Jaccard similarity (|A ∩ B| / |A ∪ B|) of two trigram sets, in the range 0.0–1.0.
///
/// Both slices must be sorted ascending and free of duplicates, which is what
/// `compute_trigrams` produces. If either side is empty the result is 0.0.
pub fn trigram_similarity(left: &[u32], right: &[u32]) -> f32 {
    if left.is_empty() || right.is_empty() {
        return 0.0;
    }

    let (mut li, mut ri) = (0usize, 0usize);
    let mut shared = 0usize;
    while let (Some(&a), Some(&b)) = (left.get(li), right.get(ri)) {
        if a < b {
            li += 1;
        } else if b < a {
            ri += 1;
        } else {
            shared += 1;
            li += 1;
            ri += 1;
        }
    }

    // |A ∪ B| = |A| + |B| − |A ∩ B|
    let total = left.len() + right.len() - shared;
    shared as f32 / total as f32
}
|
||||
|
||||
fn replace_token(text: &str, from: &str, to: &str) -> Option<String> {
|
||||
let mut changed = false;
|
||||
let replaced: Vec<&str> = text
|
||||
|
|
@ -191,15 +346,31 @@ fn push_alias(aliases: &mut Vec<String>, alias: String) {
|
|||
}
|
||||
}
|
||||
|
||||
/// Bidirectional token abbreviations expanded into search aliases so a query typed either
|
||||
/// way matches (e.g. "gt missenden" ↔ "Great Missenden", "mt" ↔ "Mount").
|
||||
const PLACE_TOKEN_ALIASES: &[(&str, &str)] = &[
|
||||
("st", "saint"),
|
||||
("saint", "st"),
|
||||
("mt", "mount"),
|
||||
("mount", "mt"),
|
||||
("gt", "great"),
|
||||
("great", "gt"),
|
||||
("lt", "little"),
|
||||
("little", "lt"),
|
||||
("upr", "upper"),
|
||||
("upper", "upr"),
|
||||
("lwr", "lower"),
|
||||
("lower", "lwr"),
|
||||
];
|
||||
|
||||
fn build_search_text(name: &str, place_type: &str) -> String {
|
||||
let primary = normalize_search_text(name);
|
||||
let mut aliases = vec![primary.clone()];
|
||||
|
||||
if let Some(alias) = replace_token(&primary, "st", "saint") {
|
||||
push_alias(&mut aliases, alias);
|
||||
}
|
||||
if let Some(alias) = replace_token(&primary, "saint", "st") {
|
||||
push_alias(&mut aliases, alias);
|
||||
for (from, to) in PLACE_TOKEN_ALIASES {
|
||||
if let Some(alias) = replace_token(&primary, from, to) {
|
||||
push_alias(&mut aliases, alias);
|
||||
}
|
||||
}
|
||||
|
||||
if place_type == "station" {
|
||||
|
|
@ -391,6 +562,26 @@ impl PlaceData {
|
|||
fallback_city
|
||||
};
|
||||
|
||||
// Build the place search index: an inverted token index over all rows (so the per-query
|
||||
// cost scales with matched candidates, not the ~1M-row corpus), plus a trigram index over
|
||||
// only fuzzy-eligible rows for bounded typo matching.
|
||||
let mut token_index: FxHashMap<String, Vec<u32>> = FxHashMap::default();
|
||||
let mut fuzzy_trigram_index: FxHashMap<u32, Vec<u32>> = FxHashMap::default();
|
||||
for idx in 0..row_count {
|
||||
for token in place_index_tokens(&name_search[idx]) {
|
||||
token_index.entry(token).or_default().push(idx as u32);
|
||||
}
|
||||
if is_fuzzy_eligible_type(&place_type_raw[idx]) {
|
||||
for trigram in compute_trigrams(&name[idx]) {
|
||||
fuzzy_trigram_index
|
||||
.entry(trigram)
|
||||
.or_default()
|
||||
.push(idx as u32);
|
||||
}
|
||||
}
|
||||
}
|
||||
let token_prefix_index = build_place_prefix_index(&token_index);
|
||||
|
||||
let with_pop = population.iter().filter(|&&pop| pop > 0).count();
|
||||
let with_city = city.iter().filter(|c| c.is_some()).count();
|
||||
info!(
|
||||
|
|
@ -398,6 +589,8 @@ impl PlaceData {
|
|||
types = place_type.values.len(),
|
||||
with_population = with_pop,
|
||||
with_city = with_city,
|
||||
tokens = token_index.len(),
|
||||
fuzzy_trigrams = fuzzy_trigram_index.len(),
|
||||
"Place data loaded"
|
||||
);
|
||||
|
||||
|
|
@ -412,14 +605,261 @@ impl PlaceData {
|
|||
lon,
|
||||
city,
|
||||
travel_destination,
|
||||
token_index,
|
||||
token_prefix_index,
|
||||
fuzzy_trigram_index,
|
||||
})
|
||||
}
|
||||
|
||||
/// Candidate place rows for the query content tokens: intersect the posting lists of words
|
||||
/// typed in full; if none matched an indexed token exactly, seed from the smallest
|
||||
/// prefix-expanded list (so a partially-typed final word still works). Bounded by
|
||||
/// `PLACE_CANDIDATE_LIMIT`.
|
||||
pub fn place_candidate_rows(&self, tokens: &[&str]) -> Vec<u32> {
|
||||
let mut exact: Vec<&[u32]> = tokens
|
||||
.iter()
|
||||
.filter_map(|token| self.token_index.get(*token).map(Vec::as_slice))
|
||||
.collect();
|
||||
|
||||
let mut rows = if exact.is_empty() {
|
||||
self.place_prefix_seed(tokens)
|
||||
} else {
|
||||
exact.sort_by_key(|posting| posting.len());
|
||||
let mut acc = exact[0].to_vec();
|
||||
for posting in &exact[1..] {
|
||||
if acc.is_empty() {
|
||||
break;
|
||||
}
|
||||
acc = intersect_sorted(&acc, posting);
|
||||
}
|
||||
acc
|
||||
};
|
||||
rows.truncate(PLACE_CANDIDATE_LIMIT);
|
||||
rows
|
||||
}
|
||||
|
||||
fn place_prefix_seed(&self, tokens: &[&str]) -> Vec<u32> {
|
||||
let mut best: Option<Vec<u32>> = None;
|
||||
for token in tokens {
|
||||
if token.len() < PLACE_PREFIX_MIN_LEN {
|
||||
continue;
|
||||
}
|
||||
let key = &token[..token.len().min(PLACE_PREFIX_MAX_LEN)];
|
||||
let Some(indexed) = self.token_prefix_index.get(key) else {
|
||||
continue;
|
||||
};
|
||||
let mut union: Vec<u32> = Vec::new();
|
||||
for indexed_token in indexed {
|
||||
if !indexed_token.starts_with(token) {
|
||||
continue;
|
||||
}
|
||||
if let Some(rows) = self.token_index.get(indexed_token) {
|
||||
union = if union.is_empty() {
|
||||
rows.clone()
|
||||
} else {
|
||||
union_sorted(&union, rows)
|
||||
};
|
||||
}
|
||||
}
|
||||
if !union.is_empty()
|
||||
&& best
|
||||
.as_ref()
|
||||
.is_none_or(|current| union.len() < current.len())
|
||||
{
|
||||
best = Some(union);
|
||||
}
|
||||
}
|
||||
best.unwrap_or_default()
|
||||
}
|
||||
|
||||
/// Fuzzy-eligible rows sharing enough trigrams with the query to be worth Jaccard scoring.
|
||||
/// Bounded by the (small) fuzzy trigram index rather than scanning every place.
|
||||
pub fn fuzzy_candidate_rows(&self, query_trigrams: &[u32]) -> Vec<u32> {
|
||||
if query_trigrams.is_empty() {
|
||||
return Vec::new();
|
||||
}
|
||||
let mut counts: FxHashMap<u32, u16> = FxHashMap::default();
|
||||
for trigram in query_trigrams {
|
||||
if let Some(rows) = self.fuzzy_trigram_index.get(trigram) {
|
||||
for &row in rows {
|
||||
*counts.entry(row).or_default() += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
let min_shared = (((query_trigrams.len() as f32) * 0.4).ceil() as u16).max(1);
|
||||
counts
|
||||
.into_iter()
|
||||
.filter_map(|(row, shared)| (shared >= min_shared).then_some(row))
|
||||
.collect()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
impl PlaceData {
    /// Test-only constructor: builds a `PlaceData` from `(name, place_type)` pairs.
    ///
    /// Only the name-derived columns and the three search indexes carry real data. Population
    /// and coordinates are zeroed, `city` is `None` and `travel_destination` is `false` for
    /// every row.
    fn from_names<S: AsRef<str>>(rows: &[(S, S)]) -> Self {
        let len = rows.len();
        let mut name: Vec<String> = Vec::with_capacity(len);
        let mut place_type_raw: Vec<String> = Vec::with_capacity(len);
        let mut name_lower: Vec<String> = Vec::with_capacity(len);
        let mut name_search: Vec<String> = Vec::with_capacity(len);
        for (raw_name, raw_type) in rows {
            let (raw_name, raw_type) = (raw_name.as_ref(), raw_type.as_ref());
            name_lower.push(raw_name.to_lowercase());
            name_search.push(build_search_text(raw_name, raw_type));
            name.push(raw_name.to_string());
            place_type_raw.push(raw_type.to_string());
        }

        // Token index covers every row; the trigram index covers fuzzy-eligible rows only.
        let mut token_index: FxHashMap<String, Vec<u32>> = FxHashMap::default();
        let mut fuzzy_trigram_index: FxHashMap<u32, Vec<u32>> = FxHashMap::default();
        let columns = name_search.iter().zip(name.iter().zip(&place_type_raw));
        for (idx, (search_text, (raw_name, raw_type))) in columns.enumerate() {
            let row = idx as u32;
            for token in place_index_tokens(search_text) {
                token_index.entry(token).or_default().push(row);
            }
            if !is_fuzzy_eligible_type(raw_type) {
                continue;
            }
            for trigram in compute_trigrams(raw_name) {
                fuzzy_trigram_index.entry(trigram).or_default().push(row);
            }
        }
        let token_prefix_index = build_place_prefix_index(&token_index);

        let place_type = InternedColumn::build(&place_type_raw);
        let type_rank = place_type_raw.iter().map(|pt| type_rank(pt)).collect();
        PlaceData {
            name,
            name_lower,
            name_search,
            place_type,
            type_rank,
            population: vec![0; len],
            lat: vec![0.0; len],
            lon: vec![0.0; len],
            city: vec![None; len],
            travel_destination: vec![false; len],
            token_index,
            token_prefix_index,
            fuzzy_trigram_index,
        }
    }
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
fn place_index_tokens_dedup_and_min_length() {
    // The one-character tokens "a" and "x" are below the 2-character minimum and must be
    // dropped. "albans" occurs in both aliases and must appear once. Output is sorted.
    assert_eq!(
        place_index_tokens("a st albans | saint albans x"),
        vec!["albans".to_string(), "saint".to_string(), "st".to_string()]
    );
}
|
||||
|
||||
#[test]
|
||||
fn place_candidate_rows_intersect_and_prefix_seed() {
|
||||
let pd = PlaceData::from_names(&[
|
||||
("Camden", "suburb"),
|
||||
("Camden Town", "suburb"),
|
||||
("Camden Market", "attraction"),
|
||||
("Manchester", "city"),
|
||||
("Manchester Piccadilly", "station"),
|
||||
]);
|
||||
|
||||
// Full word → posting list (Camden, Camden Town, Camden Market).
|
||||
let camden = pd.place_candidate_rows(&["camden"]);
|
||||
assert_eq!(camden, vec![0, 1, 2]);
|
||||
|
||||
// Two full words intersect to rows containing BOTH (Camden Town only).
|
||||
let camden_town = pd.place_candidate_rows(&["camden", "town"]);
|
||||
assert_eq!(camden_town, vec![1]);
|
||||
|
||||
// A partially-typed final word with no exact token seeds from the prefix index.
|
||||
let piccad = pd.place_candidate_rows(&["piccad"]);
|
||||
assert_eq!(piccad, vec![4]);
|
||||
|
||||
// No match → empty.
|
||||
assert!(pd.place_candidate_rows(&["zzzz"]).is_empty());
|
||||
}
|
||||
|
||||
// Run with: cargo test --release bench_place_search -- --ignored --nocapture
|
||||
#[test]
|
||||
#[ignore]
|
||||
fn bench_place_search_at_one_million_rows() {
|
||||
let roads = [
|
||||
"High Street",
|
||||
"Station Road",
|
||||
"Church Lane",
|
||||
"Victoria Road",
|
||||
"Mill Lane",
|
||||
"Park Avenue",
|
||||
"Queens Road",
|
||||
"Kings Road",
|
||||
];
|
||||
let mut rows: Vec<(String, String)> = Vec::with_capacity(1_000_000);
|
||||
for i in 0..1_000_000usize {
|
||||
// Vary the name so the index resembles ~1M distinct (street, area) rows.
|
||||
rows.push((
|
||||
format!("{} {}", roads[i % roads.len()], i % 4000),
|
||||
"street".into(),
|
||||
));
|
||||
}
|
||||
rows.push(("London".into(), "city".into()));
|
||||
let pd = PlaceData::from_names(&rows);
|
||||
|
||||
let start = std::time::Instant::now();
|
||||
let mut hits = 0usize;
|
||||
for _ in 0..50 {
|
||||
let candidates = pd.place_candidate_rows(&["high", "street"]);
|
||||
for row in candidates {
|
||||
let idx = row as usize;
|
||||
if place_search_test_score(&pd, idx, "high street", &["high", "street"]).is_some() {
|
||||
hits += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
let per_query = start.elapsed() / 50;
|
||||
println!(
|
||||
"indexed place search over {} rows: {:?}/query ({} hits)",
|
||||
pd.name.len(),
|
||||
per_query,
|
||||
hits / 50
|
||||
);
|
||||
// The old full O(N) scan measured ~36ms here; candidate-based must be far under that.
|
||||
assert!(per_query.as_millis() < 10, "per_query was {per_query:?}");
|
||||
}
|
||||
|
||||
/// Mirrors the route's per-candidate match check for the bench.
|
||||
fn place_search_test_score(
|
||||
pd: &PlaceData,
|
||||
idx: usize,
|
||||
query_search: &str,
|
||||
query_tokens: &[&str],
|
||||
) -> Option<f32> {
|
||||
let search_text = &pd.name_search[idx];
|
||||
if query_tokens.iter().all(|qt| {
|
||||
place_alias_tokens(search_text)
|
||||
.any(|t| t == *qt || (qt.len() >= 2 && t.starts_with(qt)))
|
||||
}) {
|
||||
Some(640.0)
|
||||
} else if pd.name_lower[idx] == query_search {
|
||||
Some(1000.0)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn fuzzy_candidate_rows_finds_typos_only_for_eligible_rows() {
|
||||
let pd = PlaceData::from_names(&[
|
||||
("London", "city"),
|
||||
("Baker Street", "street"), // not fuzzy-eligible
|
||||
]);
|
||||
let typo = compute_trigrams("Londn");
|
||||
let candidates = pd.fuzzy_candidate_rows(&typo);
|
||||
assert!(candidates.contains(&0)); // London (city) is reachable by fuzzy
|
||||
assert!(!candidates.contains(&1)); // streets are excluded from the fuzzy index
|
||||
}
|
||||
|
||||
fn test_city_rows() -> [(&'static str, f32, f32, u32); 5] {
|
||||
[
|
||||
("London", 51.507_446, -0.1277653, 8_908_083),
|
||||
|
|
@ -470,6 +910,29 @@ mod tests {
|
|||
assert!(build_search_text("Shadwell DLR station", "station").contains("shadwell station"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn search_text_expands_directional_and_size_abbreviations() {
|
||||
assert!(build_search_text("Great Missenden", "village").contains("gt missenden"));
|
||||
assert!(build_search_text("Mount Pleasant", "suburb").contains("mt pleasant"));
|
||||
assert!(build_search_text("Little Venice", "suburb").contains("lt venice"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn trigram_similarity_is_high_for_typos_and_low_for_unrelated() {
|
||||
let london = compute_trigrams("London");
|
||||
let typo = compute_trigrams("Londn");
|
||||
let other = compute_trigrams("Manchester");
|
||||
assert!(trigram_similarity(&london, &typo) >= 0.4);
|
||||
assert!(trigram_similarity(&london, &other) < 0.2);
|
||||
assert!((trigram_similarity(&london, &london) - 1.0).abs() < 1e-6);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn place_alias_tokens_split_across_aliases() {
|
||||
let tokens: Vec<&str> = place_alias_tokens("kings cross | kings x").collect();
|
||||
assert_eq!(tokens, vec!["kings", "cross", "kings", "x"]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn travel_destination_types_match_legacy_places() {
|
||||
assert!(is_travel_destination_type("city"));
|
||||
|
|
|
|||
|
|
@ -398,7 +398,7 @@ fn build_school_meta(
|
|||
|
||||
let mut idx = vec![u32::MAX; row_count];
|
||||
let mut meta = Vec::new();
|
||||
for row in 0..row_count {
|
||||
for (row, meta_idx) in idx.iter_mut().enumerate().take(row_count) {
|
||||
let type_group_val = fetch_str(&type_group, row);
|
||||
let type_val = fetch_str(&r#type, row);
|
||||
// type_group is present for every GIAS row, so use it as the sentinel
|
||||
|
|
@ -406,7 +406,7 @@ fn build_school_meta(
|
|||
if type_group_val.is_none() && type_val.is_none() {
|
||||
continue;
|
||||
}
|
||||
idx[row] = meta.len() as u32;
|
||||
*meta_idx = meta.len() as u32;
|
||||
meta.push(SchoolMetadata {
|
||||
phase: fetch_str(&phase, row),
|
||||
r#type: type_val,
|
||||
|
|
|
|||
|
|
@ -10,8 +10,10 @@ use rustc_hash::{FxHashMap, FxHashSet};
|
|||
use crate::consts::{H3_PRECOMPUTE_MAX, HISTOGRAM_BINS, NAN_U16, QUANT_SCALE};
|
||||
use crate::features::{self, Bounds};
|
||||
|
||||
const ADDRESS_SEARCH_CANDIDATE_LIMIT: usize = 50_000;
|
||||
const ADDRESS_SEARCH_MAX_POSTINGS_PER_TOKEN: usize = 250_000;
|
||||
/// Upper bound on rows scored per query. Intersection keeps most candidate sets far below
|
||||
/// this; only a single very common road word (e.g. "high") approaches it, and the in-area
|
||||
/// priority sort keeps a refined query's matches ahead of the cut.
|
||||
const ADDRESS_SEARCH_CANDIDATE_LIMIT: usize = 150_000;
|
||||
const ADDRESS_SEARCH_PREFIX_MIN_LEN: usize = 4;
|
||||
const ADDRESS_SEARCH_PREFIX_MAX_LEN: usize = 8;
|
||||
const NO_POI_METRIC_ROW: u32 = u32::MAX;
|
||||
|
|
@ -162,6 +164,11 @@ struct AddressTermGroup {
|
|||
#[derive(Debug)]
|
||||
struct AddressQuery {
|
||||
full_postcode: Option<String>,
|
||||
/// Compact uppercase outward code (optionally with a sector digit) recovered when the
|
||||
/// user appended a partial postcode like "NW1" or "NW1 6". Used as an additive ranking
|
||||
/// bias, never as a hard filter — so the disambiguating hint is honoured without
|
||||
/// excluding the same road in other areas.
|
||||
postcode_area: Option<String>,
|
||||
text_groups: Vec<AddressTermGroup>,
|
||||
numeric_terms: Vec<String>,
|
||||
candidate_terms: Vec<String>,
|
||||
|
|
@ -442,6 +449,138 @@ fn build_address_prefix_index(
|
|||
prefix_index
|
||||
}
|
||||
|
||||
/// Returns the row ids common to two ascending-sorted slices, in ascending order.
fn intersect_sorted(left: &[u32], right: &[u32]) -> Vec<u32> {
    let mut common = Vec::new();
    let (mut li, mut ri) = (0usize, 0usize);
    // Stop as soon as either side runs out: nothing further can be shared.
    while let (Some(&a), Some(&b)) = (left.get(li), right.get(ri)) {
        if a < b {
            li += 1;
        } else if b < a {
            ri += 1;
        } else {
            common.push(a);
            li += 1;
            ri += 1;
        }
    }
    common
}
|
||||
|
||||
/// Merges two ascending-sorted row-id slices into one ascending vector, emitting a value once
/// when it heads both inputs at the same time.
fn union_sorted(left: &[u32], right: &[u32]) -> Vec<u32> {
    let mut merged = Vec::with_capacity(left.len() + right.len());
    let (mut li, mut ri) = (0usize, 0usize);
    while let (Some(&a), Some(&b)) = (left.get(li), right.get(ri)) {
        if a < b {
            merged.push(a);
            li += 1;
        } else if b < a {
            merged.push(b);
            ri += 1;
        } else {
            merged.push(a);
            li += 1;
            ri += 1;
        }
    }
    // Append whichever tail is left; the other range is empty.
    merged.extend_from_slice(&left[li..]);
    merged.extend_from_slice(&right[ri..]);
    merged
}
|
||||
|
||||
/// Whether `token` is an ordinal such as "1st", "2nd", "3rd" or "21st" — part of a street name
/// ("2nd Avenue"), not a house-number prefix.
///
/// A token qualifies when it ends in `st`, `nd`, `rd` or `th` and everything before that
/// suffix is one or more ASCII digits. The digit/suffix pairing is not checked, so "1th" also
/// counts.
///
/// The suffix is removed with `strip_suffix` rather than a byte-offset split, so tokens
/// containing multi-byte characters return `false` instead of panicking on a non-char-boundary
/// index.
fn is_ordinal_token(token: &str) -> bool {
    ["st", "nd", "rd", "th"].iter().any(|suffix| {
        token.strip_suffix(suffix).is_some_and(|digits| {
            !digits.is_empty() && digits.bytes().all(|byte| byte.is_ascii_digit())
        })
    })
}
|
||||
|
||||
/// Leading address tokens that denote a unit/house number rather than the street itself.
|
||||
fn is_house_prefix_token(token: &str) -> bool {
|
||||
if is_ordinal_token(token) {
|
||||
return false;
|
||||
}
|
||||
matches!(
|
||||
token,
|
||||
"flat" | "fl" | "apartment" | "apt" | "unit" | "no" | "block" | "floor" | "room"
|
||||
) || token.len() == 1
|
||||
|| token.chars().all(|ch| ch.is_ascii_digit())
|
||||
|| (token.chars().next().is_some_and(|ch| ch.is_ascii_digit())
|
||||
&& token.chars().any(|ch| ch.is_ascii_alphabetic()))
|
||||
}
|
||||
|
||||
/// Street-level key for an address: drops the leading house-number / flat prefix so that
|
||||
/// "12 Baker Street" and "5 Baker Street" collapse to a single street entry.
|
||||
fn street_key(address: &str) -> String {
|
||||
let tokens = tokenize_address_text(address);
|
||||
let mut start = 0;
|
||||
while start < tokens.len() && is_house_prefix_token(&tokens[start]) {
|
||||
start += 1;
|
||||
}
|
||||
if start >= tokens.len() {
|
||||
return tokens.join(" ");
|
||||
}
|
||||
tokens[start..].join(" ")
|
||||
}
|
||||
|
||||
/// Road-type words. Their presence (with no house number) marks a road browse, which we
|
||||
/// collapse to one result per street.
|
||||
const ROAD_TYPE_TOKENS: &[&str] = &[
|
||||
"street",
|
||||
"st",
|
||||
"road",
|
||||
"rd",
|
||||
"lane",
|
||||
"ln",
|
||||
"avenue",
|
||||
"ave",
|
||||
"close",
|
||||
"cl",
|
||||
"drive",
|
||||
"dr",
|
||||
"way",
|
||||
"court",
|
||||
"ct",
|
||||
"crescent",
|
||||
"cres",
|
||||
"place",
|
||||
"terrace",
|
||||
"terr",
|
||||
"grove",
|
||||
"gardens",
|
||||
"gdns",
|
||||
"walk",
|
||||
"row",
|
||||
"square",
|
||||
"sq",
|
||||
"hill",
|
||||
"parade",
|
||||
"mews",
|
||||
"embankment",
|
||||
"broadway",
|
||||
"boulevard",
|
||||
"blvd",
|
||||
];
|
||||
|
||||
fn query_has_road_type(query: &str) -> bool {
|
||||
tokenize_address_text(query)
|
||||
.iter()
|
||||
.any(|token| ROAD_TYPE_TOKENS.contains(&token.as_str()))
|
||||
}
|
||||
|
||||
/// The outward code of a postcode: everything before its first space.
///
/// A string without a space is returned whole.
fn outcode_of(postcode: &str) -> &str {
    match postcode.split_once(' ') {
        Some((outward, _inward)) => outward,
        None => postcode,
    }
}
|
||||
|
||||
fn parse_address_query(query: &str) -> AddressQuery {
|
||||
let tokens = tokenize_address_text(query);
|
||||
let (full_postcode, postcode_token_indices) = extract_full_postcode(&tokens)
|
||||
|
|
@ -449,12 +588,45 @@ fn parse_address_query(query: &str) -> AddressQuery {
|
|||
.unwrap_or((None, Vec::new()));
|
||||
|
||||
let skip_postcode_tokens: FxHashSet<usize> = postcode_token_indices.into_iter().collect();
|
||||
|
||||
// Recover an appended partial postcode (outcode, or outcode + sector digit) as a ranking
|
||||
// bias rather than discarding it — but only from the TRAILING position, so a leading road
|
||||
// designation like "A4 Great West Road" is not mistaken for an area refinement.
|
||||
let mut postcode_area: Option<String> = None;
|
||||
let mut consumed_partial_tokens: FxHashSet<usize> = FxHashSet::default();
|
||||
if full_postcode.is_none() && !tokens.is_empty() {
|
||||
let last = tokens.len() - 1;
|
||||
if !skip_postcode_tokens.contains(&last) {
|
||||
let sector_digit =
|
||||
tokens[last].len() == 1 && tokens[last].chars().all(|ch| ch.is_ascii_digit());
|
||||
if last >= 1
|
||||
&& sector_digit
|
||||
&& !skip_postcode_tokens.contains(&(last - 1))
|
||||
&& looks_like_postcode_fragment(&tokens[last - 1])
|
||||
{
|
||||
postcode_area = Some(format!(
|
||||
"{}{}",
|
||||
tokens[last - 1].to_ascii_uppercase(),
|
||||
tokens[last]
|
||||
));
|
||||
consumed_partial_tokens.insert(last);
|
||||
consumed_partial_tokens.insert(last - 1);
|
||||
} else if looks_like_postcode_fragment(&tokens[last]) {
|
||||
postcode_area = Some(tokens[last].to_ascii_uppercase());
|
||||
consumed_partial_tokens.insert(last);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut text_groups = Vec::new();
|
||||
let mut numeric_terms = Vec::new();
|
||||
let mut candidate_terms = Vec::new();
|
||||
|
||||
for (idx, token) in tokens.iter().enumerate() {
|
||||
if skip_postcode_tokens.contains(&idx) || looks_like_postcode_fragment(token) {
|
||||
if skip_postcode_tokens.contains(&idx)
|
||||
|| consumed_partial_tokens.contains(&idx)
|
||||
|| looks_like_postcode_fragment(token)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
|
|
@ -486,6 +658,7 @@ fn parse_address_query(query: &str) -> AddressQuery {
|
|||
|
||||
AddressQuery {
|
||||
full_postcode,
|
||||
postcode_area,
|
||||
text_groups,
|
||||
numeric_terms,
|
||||
candidate_terms,
|
||||
|
|
@ -897,9 +1070,15 @@ impl PropertyData {
|
|||
&self.address_search_token_keys[offset..offset + length]
|
||||
}
|
||||
|
||||
/// Search individual property addresses. Full postcode queries use a direct row index;
|
||||
/// free-text queries use a small inverted index over distinctive address tokens.
|
||||
pub fn search_addresses(&self, query: &str, limit: usize) -> Vec<usize> {
|
||||
/// Search individual property addresses, returning `(row, score)` ranked best-first.
|
||||
///
|
||||
/// Candidate rows come from intersecting the posting lists of the distinctive words the
|
||||
/// user typed in full (so "Cherry Hinton Road" narrows to rows containing both), unioned
|
||||
/// with the exact-postcode rows when a complete postcode is present (so a postcode is a
|
||||
/// boost, not an all-or-nothing gate). An appended partial postcode keeps in-area rows
|
||||
/// ahead of the candidate cut and adds a scoring bias. With a road-type word and no house
|
||||
/// number, results collapse to one row per street.
|
||||
pub fn search_addresses(&self, query: &str, limit: usize) -> Vec<(usize, i32)> {
|
||||
if limit == 0 {
|
||||
return Vec::new();
|
||||
}
|
||||
|
|
@ -912,25 +1091,45 @@ impl PropertyData {
|
|||
return Vec::new();
|
||||
}
|
||||
|
||||
let candidate_rows: Vec<u32> = if let Some(postcode) = parsed.full_postcode.as_deref() {
|
||||
self.postcode_interner
|
||||
let mut candidate_rows = self.address_candidate_rows(&parsed.candidate_terms);
|
||||
|
||||
// A complete postcode contributes its rows too, instead of replacing the road match.
|
||||
if let Some(postcode) = parsed.full_postcode.as_deref() {
|
||||
if let Some(rows) = self
|
||||
.postcode_interner
|
||||
.get(postcode)
|
||||
.and_then(|key| self.postcode_row_index.get(&key))
|
||||
.map(|rows| rows.to_vec())
|
||||
.unwrap_or_default()
|
||||
} else if let Some(rows) = self.best_address_token_rows(&parsed.candidate_terms) {
|
||||
rows.iter()
|
||||
.take(ADDRESS_SEARCH_CANDIDATE_LIMIT)
|
||||
.copied()
|
||||
.collect()
|
||||
} else {
|
||||
Vec::new()
|
||||
};
|
||||
{
|
||||
candidate_rows = if candidate_rows.is_empty() {
|
||||
rows.clone()
|
||||
} else {
|
||||
union_sorted(&candidate_rows, rows)
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
if candidate_rows.is_empty() {
|
||||
return Vec::new();
|
||||
}
|
||||
|
||||
// When the user appended a partial postcode, keep in-area rows ahead of the cut so the
|
||||
// refinement still surfaces even for very common roads. Single pass (stable partition) so
|
||||
// the postcode check — which allocates — runs exactly once per candidate.
|
||||
if let Some(area) = parsed.postcode_area.as_deref() {
|
||||
let mut in_area = Vec::new();
|
||||
let mut others = Vec::new();
|
||||
for &row in &candidate_rows {
|
||||
if self.row_postcode_in_area(row as usize, area) {
|
||||
in_area.push(row);
|
||||
} else {
|
||||
others.push(row);
|
||||
}
|
||||
}
|
||||
in_area.extend(others);
|
||||
candidate_rows = in_area;
|
||||
}
|
||||
candidate_rows.truncate(ADDRESS_SEARCH_CANDIDATE_LIMIT);
|
||||
|
||||
let mut scored: Vec<(i32, usize, usize)> = candidate_rows
|
||||
.into_iter()
|
||||
.filter_map(|row| {
|
||||
|
|
@ -948,18 +1147,29 @@ impl PropertyData {
|
|||
.then(left.2.cmp(&right.2))
|
||||
});
|
||||
|
||||
// Collapse a road browse (road-type word, no house number) to one row per street.
|
||||
let collapse_streets = parsed.numeric_terms.is_empty() && query_has_road_type(query);
|
||||
|
||||
let mut seen = FxHashSet::default();
|
||||
let mut results = Vec::with_capacity(limit);
|
||||
for (_, _, row) in scored {
|
||||
for (score, _, row) in scored {
|
||||
let address = self.address(row).trim();
|
||||
if address.is_empty() {
|
||||
continue;
|
||||
}
|
||||
let key = format!("{}\n{}", address.to_ascii_lowercase(), self.postcode(row));
|
||||
let key = if collapse_streets {
|
||||
format!(
|
||||
"{}\n{}",
|
||||
street_key(address),
|
||||
outcode_of(self.postcode(row))
|
||||
)
|
||||
} else {
|
||||
format!("{}\n{}", address.to_ascii_lowercase(), self.postcode(row))
|
||||
};
|
||||
if !seen.insert(key) {
|
||||
continue;
|
||||
}
|
||||
results.push(row);
|
||||
results.push((row, score));
|
||||
if results.len() == limit {
|
||||
break;
|
||||
}
|
||||
|
|
@ -968,36 +1178,75 @@ impl PropertyData {
|
|||
results
|
||||
}
|
||||
|
||||
fn best_address_token_rows(&self, terms: &[String]) -> Option<&[u32]> {
|
||||
let mut best: Option<&[u32]> = None;
|
||||
|
||||
for term in terms {
|
||||
if let Some(rows) = self.address_token_index.get(term) {
|
||||
if best.is_none_or(|current| rows.len() < current.len()) {
|
||||
best = Some(rows.as_slice());
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
if term.len() < 4 {
|
||||
continue;
|
||||
}
|
||||
|
||||
if let Some(tokens) = self.address_prefix_index.get(address_prefix_key(term)) {
|
||||
for token in tokens {
|
||||
if !token.starts_with(term) {
|
||||
continue;
|
||||
}
|
||||
if let Some(rows) = self.address_token_index.get(token) {
|
||||
if best.is_none_or(|current| rows.len() < current.len()) {
|
||||
best = Some(rows.as_slice());
|
||||
}
|
||||
}
|
||||
}
|
||||
/// True when the row's postcode begins with the compact partial-postcode `area`
|
||||
/// (e.g. "NW1" or "NW16" matches "NW1 6XE").
|
||||
fn row_postcode_in_area(&self, row: usize, area: &str) -> bool {
|
||||
let mut compact = String::new();
|
||||
for ch in self.postcode(row).chars() {
|
||||
if !ch.is_whitespace() {
|
||||
compact.push(ch.to_ascii_uppercase());
|
||||
}
|
||||
}
|
||||
compact.starts_with(area)
|
||||
}
|
||||
|
||||
best
|
||||
/// Candidate rows for the distinctive query words. Words typed in full intersect by their
|
||||
/// exact posting lists (precise); a still-being-typed final word with no exact match seeds
|
||||
/// from the smallest prefix-expanded posting list (so partial typing keeps working).
|
||||
fn address_candidate_rows(&self, terms: &[String]) -> Vec<u32> {
|
||||
let mut exact: Vec<&[u32]> = terms
|
||||
.iter()
|
||||
.filter_map(|term| self.address_token_index.get(term).map(Vec::as_slice))
|
||||
.collect();
|
||||
|
||||
if !exact.is_empty() {
|
||||
exact.sort_by_key(|rows| rows.len());
|
||||
let mut acc = exact[0].to_vec();
|
||||
for rows in &exact[1..] {
|
||||
if acc.is_empty() {
|
||||
break;
|
||||
}
|
||||
acc = intersect_sorted(&acc, rows);
|
||||
}
|
||||
return acc;
|
||||
}
|
||||
|
||||
self.prefix_seed_rows(terms)
|
||||
}
|
||||
|
||||
/// Seed rows from the smallest prefix-expanded term — used only when no word matched an
|
||||
/// indexed token exactly (i.e. the user is still typing the final word).
|
||||
fn prefix_seed_rows(&self, terms: &[String]) -> Vec<u32> {
|
||||
let mut best: Option<Vec<u32>> = None;
|
||||
for term in terms {
|
||||
if term.len() < ADDRESS_SEARCH_PREFIX_MIN_LEN {
|
||||
continue;
|
||||
}
|
||||
let Some(tokens) = self.address_prefix_index.get(address_prefix_key(term)) else {
|
||||
continue;
|
||||
};
|
||||
let mut union: Vec<u32> = Vec::new();
|
||||
for token in tokens {
|
||||
if !token.starts_with(term) {
|
||||
continue;
|
||||
}
|
||||
if let Some(rows) = self.address_token_index.get(token) {
|
||||
union = if union.is_empty() {
|
||||
rows.clone()
|
||||
} else {
|
||||
union_sorted(&union, rows)
|
||||
};
|
||||
}
|
||||
}
|
||||
if !union.is_empty()
|
||||
&& best
|
||||
.as_ref()
|
||||
.is_none_or(|current| union.len() < current.len())
|
||||
{
|
||||
best = Some(union);
|
||||
}
|
||||
}
|
||||
best.unwrap_or_default()
|
||||
}
|
||||
|
||||
fn address_match_score(&self, row: usize, parsed: &AddressQuery) -> Option<i32> {
|
||||
|
|
@ -1037,6 +1286,12 @@ impl PropertyData {
|
|||
if numeric_matches == parsed.numeric_terms.len() && numeric_matches > 0 {
|
||||
score += 50;
|
||||
}
|
||||
// Additive bias (never a filter) when the row sits in the appended partial postcode.
|
||||
if let Some(area) = parsed.postcode_area.as_deref() {
|
||||
if self.row_postcode_in_area(row, area) {
|
||||
score += 400;
|
||||
}
|
||||
}
|
||||
|
||||
Some(score)
|
||||
}
|
||||
|
|
@ -1969,16 +2224,23 @@ impl PropertyData {
|
|||
}
|
||||
}
|
||||
}
|
||||
let address_token_count_before_prune = address_token_index.len();
|
||||
address_token_index.retain(|_, rows| rows.len() <= ADDRESS_SEARCH_MAX_POSTINGS_PER_TOKEN);
|
||||
// Keep every distinctive token: common road words ("high", "church", "station") are
|
||||
// exactly what people search, and dropping them made those roads unsearchable while a
|
||||
// prefix fallback surfaced the wrong street ("Highbury" for "High"). The candidate scan
|
||||
// is bounded per query instead (ADDRESS_SEARCH_CANDIDATE_LIMIT), and stop words are
|
||||
// already excluded from the index, so the largest posting lists stay modest.
|
||||
let max_postings = address_token_index
|
||||
.values()
|
||||
.map(Vec::len)
|
||||
.max()
|
||||
.unwrap_or(0);
|
||||
let address_prefix_index = build_address_prefix_index(&address_token_index);
|
||||
let address_search_interner = address_search_rodeo.into_reader();
|
||||
let address_postings_count: usize = address_token_index.values().map(Vec::len).sum();
|
||||
tracing::info!(
|
||||
tokens = address_token_index.len(),
|
||||
prefixes = address_prefix_index.len(),
|
||||
pruned_tokens =
|
||||
address_token_count_before_prune.saturating_sub(address_token_index.len()),
|
||||
max_postings_per_token = max_postings,
|
||||
postings = address_postings_count,
|
||||
row_tokens = address_search_token_keys.len(),
|
||||
"Address search index built"
|
||||
|
|
@ -2340,6 +2602,79 @@ mod tests {
|
|||
assert_eq!(parsed.candidate_terms, vec!["downing".to_string()]);
|
||||
}
|
||||
|
||||
/// A partial postcode typed after the road is captured as an area bias, not a search term.
#[test]
fn address_query_recovers_appended_partial_postcode_as_bias() {
    let query = parse_address_query("Baker Street NW1");

    // Only the area refinement is populated; no complete postcode was typed.
    assert!(query.full_postcode.is_none());
    assert_eq!(query.postcode_area.as_deref(), Some("NW1"));

    // Nothing numeric is left over to be mistaken for a house number.
    assert!(query.numeric_terms.is_empty());

    // The road words are still searchable; the postcode fragment did not consume them.
    assert_eq!(query.candidate_terms, vec![String::from("baker")]);
}
|
||||
|
||||
/// An outcode followed by a sector digit ("CR0 2") is folded into one compact area.
#[test]
fn address_query_recovers_outcode_plus_sector_without_a_phantom_house_number() {
    let query = parse_address_query("High Street CR0 2");

    assert_eq!(query.postcode_area.as_deref(), Some("CR02"));
    assert_eq!(query.candidate_terms, vec![String::from("high")]);

    // The lone sector digit must not be treated as a house number.
    assert!(query.numeric_terms.is_empty());
}
|
||||
|
||||
/// When a complete postcode is present, no separate partial-area bias is recorded.
#[test]
fn full_postcode_takes_precedence_over_partial_bias() {
    let query = parse_address_query("Baker Street NW1 6XE");

    assert!(query.postcode_area.is_none());
    assert_eq!(query.full_postcode.as_deref(), Some("NW1 6XE"));
}
|
||||
|
||||
/// Sorted row-id set operations: intersection keeps common ids, union merges without
/// duplicates, and both cope with disjoint or empty inputs.
#[test]
fn intersect_and_union_sorted_row_ids() {
    // Intersection.
    let common = intersect_sorted(&[1, 2, 3, 5], &[2, 3, 4, 5]);
    assert_eq!(common, vec![2, 3, 5]);
    let disjoint = intersect_sorted(&[1, 2], &[3, 4]);
    assert_eq!(disjoint, Vec::<u32>::new());

    // Union.
    let merged = union_sorted(&[1, 3, 5], &[2, 3, 4]);
    assert_eq!(merged, vec![1, 2, 3, 4, 5]);
    let from_empty = union_sorted(&[], &[2, 4]);
    assert_eq!(from_empty, vec![2, 4]);
}
|
||||
|
||||
/// House numbers (including lettered ones) and flat prefixes are stripped, so every
/// address on a street maps to the same lower-cased key.
#[test]
fn street_key_collapses_house_numbers_and_flats() {
    let cases = [
        ("12 Baker Street", "baker street"),
        ("5 Baker Street", "baker street"),
        ("Flat 2, 10 Downing Street", "downing street"),
        ("221B Baker Street", "baker street"),
    ];
    for (address, expected) in cases {
        assert_eq!(street_key(address), expected);
    }
}
|
||||
|
||||
/// Ordinals ("2nd", "3rd") belong to the street name and must survive key collapsing.
#[test]
fn street_key_keeps_ordinal_street_names() {
    // Ordinals are part of the street name, not a house-number prefix.
    for (address, expected) in [("2nd Avenue", "2nd avenue"), ("12 3rd Avenue", "3rd avenue")] {
        assert_eq!(street_key(address), expected);
    }

    // Only a number with an ordinal suffix counts; bare or lettered numbers do not.
    for (token, expected) in [("21st", true), ("21", false), ("221b", false)] {
        assert_eq!(is_ordinal_token(token), expected);
    }
}
|
||||
|
||||
/// Only a postcode-like token at the end of the query is treated as an area refinement.
#[test]
fn postcode_area_recovered_only_from_the_trailing_position() {
    // A leading road designation must NOT be taken as an area refinement.
    let leading = parse_address_query("A4 Great West Road");
    assert!(leading.postcode_area.is_none());

    // A genuine trailing outcode still is.
    let trailing = parse_address_query("Great West Road W4");
    assert_eq!(trailing.postcode_area.as_deref(), Some("W4"));
}
|
||||
|
||||
/// A query counts as a road browse only when it contains a road-type word.
#[test]
fn road_type_detection() {
    for with_road_type in ["high street", "acacia avenue"] {
        assert!(query_has_road_type(with_road_type));
    }
    for without_road_type in ["acacia", "london"] {
        assert!(!query_has_road_type(without_road_type));
    }
}
|
||||
|
||||
#[test]
|
||||
fn address_query_parsing_keeps_partial_terms_for_row_matching() {
|
||||
let parsed = parse_address_query("settlers cour");
|
||||
|
|
|
|||
|
|
@ -507,8 +507,7 @@ async fn main() -> anyhow::Result<()> {
|
|||
"property_borders.pmtiles",
|
||||
);
|
||||
|
||||
let noise_overlay_reader =
|
||||
init_required_tile_reader("Noise", &noise_overlay_tiles).await?;
|
||||
let noise_overlay_reader = init_required_tile_reader("Noise", &noise_overlay_tiles).await?;
|
||||
let satellite_reader = init_required_tile_reader("Satellite", &satellite_tiles).await?;
|
||||
let satellite_highres_reader =
|
||||
init_required_tile_reader("Satellite high-res", &satellite_highres_tiles).await?;
|
||||
|
|
|
|||
|
|
@ -2,14 +2,26 @@ use std::sync::Arc;
|
|||
|
||||
use axum::extract::{Query, State};
|
||||
use axum::response::Json;
|
||||
use rustc_hash::FxHashSet;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tracing::info;
|
||||
|
||||
use crate::api_error::ApiError;
|
||||
use crate::consts::PLACES_LIMIT;
|
||||
use crate::data::{normalize_search_text, slugify};
|
||||
use crate::data::{
|
||||
compute_trigrams, normalize_search_text, place_alias_tokens, slugify, trigram_similarity,
|
||||
};
|
||||
use crate::state::SharedState;
|
||||
|
||||
/// Connective words dropped (at any position) from a place query so "fish and chips" matches a place
|
||||
/// stored (after `&` is normalized away) as "fish chips".
|
||||
const QUERY_STOP_WORDS: &[&str] = &["and", "the", "of"];
|
||||
|
||||
/// Minimum trigram similarity for a fuzzy place match.
|
||||
const FUZZY_MIN_SIMILARITY: f32 = 0.42;
|
||||
/// Run the (linear) fuzzy pass only when the exact passes found fewer than this.
|
||||
const FUZZY_TRIGGER_BELOW: usize = 3;
|
||||
|
||||
#[derive(Serialize)]
|
||||
pub struct PlaceResult {
|
||||
name: String,
|
||||
|
|
@ -29,6 +41,43 @@ pub struct AddressResult {
|
|||
lon: f32,
|
||||
}
|
||||
|
||||
/// A single, category-tagged, relevance-scored result. The frontend renders these in order,
|
||||
/// so ranking is unified across places, outcodes, postcodes and addresses instead of the old
|
||||
/// fixed positional bucketing.
|
||||
#[derive(Serialize)]
|
||||
#[serde(tag = "type", rename_all = "lowercase")]
|
||||
pub enum UnifiedResult {
|
||||
Place {
|
||||
name: String,
|
||||
slug: String,
|
||||
place_type: String,
|
||||
lat: f32,
|
||||
lon: f32,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
city: Option<String>,
|
||||
score: f32,
|
||||
},
|
||||
Postcode {
|
||||
label: String,
|
||||
score: f32,
|
||||
},
|
||||
Address {
|
||||
address: String,
|
||||
postcode: String,
|
||||
lat: f32,
|
||||
lon: f32,
|
||||
score: f32,
|
||||
},
|
||||
}
|
||||
|
||||
fn unified_score(result: &UnifiedResult) -> f32 {
|
||||
match result {
|
||||
UnifiedResult::Place { score, .. }
|
||||
| UnifiedResult::Postcode { score, .. }
|
||||
| UnifiedResult::Address { score, .. } => *score,
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
pub struct PlacesResponse {
|
||||
places: Vec<PlaceResult>,
|
||||
|
|
@ -36,6 +85,9 @@ pub struct PlacesResponse {
|
|||
postcodes: Vec<String>,
|
||||
#[serde(skip_serializing_if = "Vec::is_empty")]
|
||||
addresses: Vec<AddressResult>,
|
||||
/// Unified, relevance-ordered results. Preferred by the frontend; the arrays above remain
|
||||
/// for backward compatibility.
|
||||
results: Vec<UnifiedResult>,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
|
|
@ -44,6 +96,9 @@ pub struct PlacesParams {
|
|||
q: String,
|
||||
/// If set, only return places that have travel time data for this mode.
|
||||
mode: Option<String>,
|
||||
/// Optional map-viewport centre used to bias ranking toward what the user is looking at.
|
||||
lat: Option<f32>,
|
||||
lng: Option<f32>,
|
||||
}
|
||||
|
||||
fn compact_postcode_query(query: &str) -> String {
|
||||
|
|
@ -93,6 +148,131 @@ fn postcode_starts_with_compact(postcode: &str, compact_query: &str) -> bool {
|
|||
current.is_none()
|
||||
}
|
||||
|
||||
/// Heuristic for "looks like an outcode or partial postcode": 2 to 4 ASCII alphanumeric
/// characters, starting with a letter and containing at least one digit.
fn is_postcode_fragmentish(token: &str) -> bool {
    let bytes = token.as_bytes();
    if bytes.len() < 2 || bytes.len() > 4 {
        return false;
    }
    // Any non-ASCII character contributes bytes >= 0x80, which fail this check, so from
    // here on bytes and characters coincide.
    if !bytes.iter().all(u8::is_ascii_alphanumeric) {
        return false;
    }
    bytes[0].is_ascii_alphabetic() && bytes.iter().any(u8::is_ascii_digit)
}
|
||||
|
||||
/// Peel a trailing geographic refinement (outcode, or outcode + sector digit) off the query.
|
||||
/// "camden nw1" → ("camden", Some("NW1")); the core matches the place, the refinement biases
|
||||
/// ranking and drives the outcode/postcode lists — instead of breaking the match entirely.
|
||||
fn split_geographic_refinement(query: &str) -> (String, Option<String>) {
|
||||
let words: Vec<&str> = query.split_whitespace().collect();
|
||||
if words.len() < 2 {
|
||||
return (query.to_string(), None);
|
||||
}
|
||||
let last = words[words.len() - 1];
|
||||
if words.len() >= 3 && last.len() == 1 && last.chars().all(|ch| ch.is_ascii_digit()) {
|
||||
let prev = words[words.len() - 2];
|
||||
if is_postcode_fragmentish(prev) {
|
||||
let area = format!("{}{}", prev.to_ascii_uppercase(), last);
|
||||
return (words[..words.len() - 2].join(" "), Some(area));
|
||||
}
|
||||
}
|
||||
if is_postcode_fragmentish(last) {
|
||||
return (
|
||||
words[..words.len() - 1].join(" "),
|
||||
Some(last.to_ascii_uppercase()),
|
||||
);
|
||||
}
|
||||
(query.to_string(), None)
|
||||
}
|
||||
|
||||
/// Content words of a place query, dropping connectives so "fish and chips" matches "Fish & Chips".
|
||||
fn query_content_tokens(query_search: &str) -> Vec<&str> {
|
||||
query_search
|
||||
.split(' ')
|
||||
.filter(|token| !token.is_empty() && !QUERY_STOP_WORDS.contains(token))
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Base relevance tier for a place, or None if it does not match at all.
|
||||
fn place_base_score(
|
||||
search_text: &str,
|
||||
name_lower: &str,
|
||||
query_search: &str,
|
||||
query_lower: &str,
|
||||
query_tokens: &[&str],
|
||||
) -> Option<f32> {
|
||||
if query_search.is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let mut exact = name_lower == query_lower;
|
||||
let mut prefix = name_lower.starts_with(query_lower);
|
||||
for alias in search_text.split(" | ") {
|
||||
if alias == query_search {
|
||||
exact = true;
|
||||
}
|
||||
if alias.starts_with(query_search) {
|
||||
prefix = true;
|
||||
}
|
||||
}
|
||||
if exact {
|
||||
return Some(1000.0);
|
||||
}
|
||||
if prefix {
|
||||
return Some(820.0);
|
||||
}
|
||||
|
||||
if !query_tokens.is_empty() {
|
||||
let all_covered = query_tokens.iter().all(|query_token| {
|
||||
place_alias_tokens(search_text).any(|token| {
|
||||
token == *query_token || (query_token.len() >= 2 && token.starts_with(query_token))
|
||||
})
|
||||
});
|
||||
if all_covered {
|
||||
return Some(640.0);
|
||||
}
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
/// Additive ranking bonus from a place's type and population.
///
/// The type bonus is 8 points per rank step below 6 (ranks of 6 or more earn nothing); the
/// population bonus is `4 * ln(population + 1)`, capped at 64.
fn place_modifiers(type_rank: u8, population: u32) -> f32 {
    let rank_headroom = 6u8.saturating_sub(type_rank);
    let type_bonus = f32::from(rank_headroom) * 8.0;
    let capped_pop_bonus = ((population as f32 + 1.0).ln() * 4.0).min(64.0);
    type_bonus + capped_pop_bonus
}
|
||||
|
||||
/// Exponential distance-decay bonus toward an optional bias centre.
///
/// Returns `0.0` without a centre. Otherwise the distance is measured in degrees on a
/// local flat approximation (longitude difference scaled by the cosine of the centre
/// latitude) and mapped to `160 * exp(-distance / 0.3)`, so the bonus is 160 at the centre
/// and decays toward zero with distance.
fn proximity_bonus(center: Option<(f32, f32)>, lat: f32, lon: f32) -> f32 {
    match center {
        None => 0.0,
        Some((center_lat, center_lon)) => {
            let north = lat - center_lat;
            let east = (lon - center_lon) * center_lat.to_radians().cos();
            let dist = (north * north + east * east).sqrt();
            160.0 * (-dist / 0.3).exp()
        }
    }
}
|
||||
|
||||
/// Converts an address match's raw specificity score to the unified scale:
/// `460 + 0.47 * raw`, with `raw` capped at 1000 from above (no lower bound is applied).
fn address_unified_score(raw: i32) -> f32 {
    let capped = if raw > 1000 { 1000 } else { raw };
    460.0 + capped as f32 * 0.47
}
|
||||
|
||||
/// Resolves the outcode a compact partial postcode sits in (e.g. "NW16" → "nw1").
///
/// `area` is lower-cased and then progressively shorter prefixes of it — from the full
/// text down to two bytes — are compared against `name_lower`, which must hold lower-cased
/// outcode names. Returns the index of the first name equal to the longest matching
/// prefix, or `None` when no prefix of at least two bytes is a known outcode.
///
/// Prefixes are taken with checked slicing, so input containing multi-byte characters
/// yields `None` for the offending lengths instead of panicking on a non-boundary slice.
fn resolve_outcode_idx(name_lower: &[String], area: &str) -> Option<usize> {
    let area_lower = area.to_lowercase();
    (2..=area_lower.len())
        .rev()
        // `get` returns `None` when `len` does not fall on a character boundary.
        .filter_map(|len| area_lower.get(..len))
        .find_map(|candidate| name_lower.iter().position(|name| name == candidate))
}
|
||||
|
||||
pub async fn get_places(
|
||||
State(shared): State<Arc<SharedState>>,
|
||||
Query(params): Query<PlacesParams>,
|
||||
|
|
@ -106,154 +286,229 @@ pub async fn get_places(
|
|||
|
||||
let limit = PLACES_LIMIT;
|
||||
let mode_filter = params.mode;
|
||||
let viewport = match (params.lat, params.lng) {
|
||||
(Some(lat), Some(lng)) => Some((lat, lng)),
|
||||
_ => None,
|
||||
};
|
||||
|
||||
let places = tokio::task::spawn_blocking(move || {
|
||||
let response = tokio::task::spawn_blocking(move || {
|
||||
let t0 = std::time::Instant::now();
|
||||
let query_lower = query.to_lowercase();
|
||||
let query_search = normalize_search_text(&query);
|
||||
let pd = &state.place_data;
|
||||
let od = &state.outcode_data;
|
||||
let postcode_data = &state.postcode_data;
|
||||
let tt_store = &state.travel_time_store;
|
||||
let property_data = &state.data;
|
||||
|
||||
// Linear scan — ~50-100k rows, <1ms
|
||||
// Tuple: (row_idx, is_exact, is_prefix, type_rank, population, name_len, slug)
|
||||
let mut matches: Vec<(usize, bool, bool, u8, u32, usize, String)> = pd
|
||||
.name_search
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter_map(|(idx, search_text)| {
|
||||
if query_search.is_empty() || !search_text.contains(&query_search) {
|
||||
return None;
|
||||
}
|
||||
let slug = slugify(&pd.name[idx]);
|
||||
// Peel any appended outcode/partial-postcode so the place text matches on the core
|
||||
// words while the refinement biases ranking and drives the outcode/postcode lists.
|
||||
let (split_query, refinement) = split_geographic_refinement(&query);
|
||||
// Only honour the refinement when it resolves to a real outcode; otherwise (e.g. "the o2",
|
||||
// where "o2" looks postcode-ish but is not an outcode) treat the whole query as place text.
|
||||
let refinement_outcode = refinement
|
||||
.as_deref()
|
||||
.and_then(|area| resolve_outcode_idx(&od.name_lower, area));
|
||||
let place_query = if refinement.is_some() && refinement_outcode.is_none() {
|
||||
query.clone()
|
||||
} else {
|
||||
split_query
|
||||
};
|
||||
let query_search = normalize_search_text(&place_query);
|
||||
let query_lower = place_query.to_lowercase();
|
||||
let query_tokens = query_content_tokens(&query_search);
|
||||
|
||||
// If mode filter is set, keep the historical travel destination set only.
|
||||
if let Some(ref mode) = mode_filter {
|
||||
if !pd.travel_destination[idx] || !tt_store.has_destination(mode, &slug) {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
// Bias centre: explicit viewport, else the resolved refinement outcode's centroid.
|
||||
let bias_center = viewport.or_else(|| refinement_outcode.map(|idx| od.centroids[idx]));
|
||||
|
||||
let is_exact = search_text
|
||||
.split(" | ")
|
||||
.any(|alias| alias == query_search || pd.name_lower[idx] == query_lower);
|
||||
let is_prefix = search_text
|
||||
.split(" | ")
|
||||
.any(|alias| alias.starts_with(&query_search))
|
||||
|| pd.name_lower[idx].starts_with(&query_lower);
|
||||
Some((
|
||||
idx,
|
||||
is_exact,
|
||||
is_prefix,
|
||||
pd.type_rank[idx],
|
||||
pd.population[idx],
|
||||
pd.name[idx].len(),
|
||||
slug,
|
||||
))
|
||||
// ---- Places: candidate rows from the inverted token index, then exact/prefix/token-AND
|
||||
// scoring — bounded by matched candidates, not the ~1M-row corpus. Fuzzy fallback uses the
|
||||
// (small) trigram index over fuzzy-eligible rows only.
|
||||
let mut place_results: Vec<(f32, PlaceResult)> = Vec::new();
|
||||
let mut matched_place_idx: FxHashSet<usize> = FxHashSet::default();
|
||||
let make_place = |idx: usize| PlaceResult {
|
||||
name: pd.name[idx].clone(),
|
||||
slug: slugify(&pd.name[idx]),
|
||||
place_type: pd.place_type.get(idx).to_string(),
|
||||
lat: pd.lat[idx],
|
||||
lon: pd.lon[idx],
|
||||
city: pd.city[idx].clone(),
|
||||
};
|
||||
let passes_mode = |idx: usize| {
|
||||
mode_filter.as_ref().is_none_or(|mode| {
|
||||
pd.travel_destination[idx]
|
||||
&& tt_store.has_destination(mode, &slugify(&pd.name[idx]))
|
||||
})
|
||||
.collect();
|
||||
};
|
||||
|
||||
// Sort: exact first, then prefix, then type rank asc, then population desc, then name length asc
|
||||
matches.sort_unstable_by(|lhs, rhs| {
|
||||
rhs.1
|
||||
.cmp(&lhs.1)
|
||||
.then(rhs.2.cmp(&lhs.2))
|
||||
.then(lhs.3.cmp(&rhs.3))
|
||||
.then(rhs.4.cmp(&lhs.4))
|
||||
.then(lhs.5.cmp(&rhs.5))
|
||||
});
|
||||
for row in pd.place_candidate_rows(&query_tokens) {
|
||||
let idx = row as usize;
|
||||
let Some(base) = place_base_score(
|
||||
&pd.name_search[idx],
|
||||
&pd.name_lower[idx],
|
||||
&query_search,
|
||||
&query_lower,
|
||||
&query_tokens,
|
||||
) else {
|
||||
continue;
|
||||
};
|
||||
if !passes_mode(idx) {
|
||||
continue;
|
||||
}
|
||||
let score = base
|
||||
+ place_modifiers(pd.type_rank[idx], pd.population[idx])
|
||||
+ proximity_bonus(bias_center, pd.lat[idx], pd.lon[idx]);
|
||||
matched_place_idx.insert(idx);
|
||||
place_results.push((score, make_place(idx)));
|
||||
}
|
||||
|
||||
matches.truncate(limit);
|
||||
// Fuzzy (trigram) fallback only when the exact passes were thin and the query is long
|
||||
// enough to be discriminating.
|
||||
if place_results.len() < FUZZY_TRIGGER_BELOW && query_search.len() >= 4 {
|
||||
let query_trigrams = compute_trigrams(&place_query);
|
||||
for row in pd.fuzzy_candidate_rows(&query_trigrams) {
|
||||
let idx = row as usize;
|
||||
if matched_place_idx.contains(&idx) || !passes_mode(idx) {
|
||||
continue;
|
||||
}
|
||||
let similarity =
|
||||
trigram_similarity(&query_trigrams, &compute_trigrams(&pd.name[idx]));
|
||||
if similarity < FUZZY_MIN_SIMILARITY {
|
||||
continue;
|
||||
}
|
||||
let score = 280.0
|
||||
+ similarity * 120.0
|
||||
+ place_modifiers(pd.type_rank[idx], pd.population[idx])
|
||||
+ proximity_bonus(bias_center, pd.lat[idx], pd.lon[idx]);
|
||||
matched_place_idx.insert(idx);
|
||||
place_results.push((score, make_place(idx)));
|
||||
}
|
||||
}
|
||||
|
||||
let mut results: Vec<PlaceResult> = matches
|
||||
.iter()
|
||||
.map(|(idx, .., slug)| PlaceResult {
|
||||
name: pd.name[*idx].clone(),
|
||||
slug: slug.clone(),
|
||||
place_type: pd.place_type.get(*idx).to_string(),
|
||||
lat: pd.lat[*idx],
|
||||
lon: pd.lon[*idx],
|
||||
city: pd.city[*idx].clone(),
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Also search outcodes (skip when mode filter is set — outcodes aren't travel destinations)
|
||||
if mode_filter.is_none() {
|
||||
let query_upper = query_lower.to_uppercase();
|
||||
let mut outcode_results: Vec<PlaceResult> = od
|
||||
.name_lower
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter_map(|(idx, name)| {
|
||||
if !name.starts_with(&query_lower) {
|
||||
return None;
|
||||
}
|
||||
let is_exact = name.len() == query_lower.len();
|
||||
Some((idx, is_exact))
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
.into_iter()
|
||||
.map(|(idx, _is_exact)| PlaceResult {
|
||||
// ---- Outcodes (skipped under a mode filter) ----
|
||||
let push_outcode = |results: &mut Vec<(f32, PlaceResult)>, idx: usize, base: f32| {
|
||||
let (clat, clon) = od.centroids[idx];
|
||||
results.push((
|
||||
base + proximity_bonus(bias_center, clat, clon),
|
||||
PlaceResult {
|
||||
name: od.names[idx].clone(),
|
||||
slug: od.names[idx].to_lowercase(),
|
||||
place_type: "outcode".to_string(),
|
||||
lat: od.centroids[idx].0,
|
||||
lon: od.centroids[idx].1,
|
||||
lat: clat,
|
||||
lon: clon,
|
||||
city: od.cities[idx].clone(),
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Sort outcodes: exact first, then by name length (shorter = broader area)
|
||||
outcode_results.sort_unstable_by(|a, b| {
|
||||
let a_exact = a.name.eq_ignore_ascii_case(&query_upper);
|
||||
let b_exact = b.name.eq_ignore_ascii_case(&query_upper);
|
||||
b_exact.cmp(&a_exact).then(a.name.len().cmp(&b.name.len()))
|
||||
});
|
||||
|
||||
// Prepend outcode results (up to 3) before place results, keeping total ≤ limit
|
||||
outcode_results.truncate(3);
|
||||
let place_slots = limit.saturating_sub(outcode_results.len());
|
||||
results.truncate(place_slots);
|
||||
outcode_results.append(&mut results);
|
||||
results = outcode_results;
|
||||
},
|
||||
));
|
||||
};
|
||||
if mode_filter.is_none() {
|
||||
if let Some(idx) = refinement_outcode {
|
||||
// A refinement ("camden nw1") resolves to exactly one outcode — no NW10/NW11 noise.
|
||||
push_outcode(&mut place_results, idx, 980.0);
|
||||
} else if looks_like_postcode_prefix(&query) {
|
||||
// A bare postcode-prefix query ("e1") lists matching outcodes (e1, e10, e11, ...).
|
||||
let area_lower = compact_postcode_query(&query).to_lowercase();
|
||||
for idx in 0..od.names.len() {
|
||||
let name = &od.name_lower[idx];
|
||||
let is_exact = *name == area_lower;
|
||||
if !(name.starts_with(&area_lower) || area_lower.starts_with(name.as_str())) {
|
||||
continue;
|
||||
}
|
||||
push_outcode(
|
||||
&mut place_results,
|
||||
idx,
|
||||
if is_exact { 980.0 } else { 760.0 },
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let postcodes: Vec<String> = if mode_filter.is_none() && looks_like_postcode_prefix(&query)
|
||||
{
|
||||
let compact_query = compact_postcode_query(&query);
|
||||
postcode_data
|
||||
.postcodes
|
||||
.iter()
|
||||
.filter(|postcode| postcode_starts_with_compact(postcode, &compact_query))
|
||||
.filter(|postcode| !property_data.rows_for_postcode(postcode).is_empty())
|
||||
.take(limit)
|
||||
.cloned()
|
||||
.collect()
|
||||
} else {
|
||||
Vec::new()
|
||||
};
|
||||
place_results.sort_by(|left, right| right.0.total_cmp(&left.0));
|
||||
place_results.truncate(limit);
|
||||
|
||||
let addresses: Vec<AddressResult> = if mode_filter.is_none() {
|
||||
property_data
|
||||
.search_addresses(&query, limit)
|
||||
.into_iter()
|
||||
.map(|row| AddressResult {
|
||||
address: property_data.address(row).trim().to_string(),
|
||||
postcode: property_data.postcode(row).to_string(),
|
||||
lat: property_data.lat[row],
|
||||
lon: property_data.lon[row],
|
||||
})
|
||||
.collect()
|
||||
} else {
|
||||
Vec::new()
|
||||
};
|
||||
// ---- Postcodes (full-postcode prefix list) ----
|
||||
let mut postcode_results: Vec<(f32, String)> = Vec::new();
|
||||
if mode_filter.is_none() && looks_like_postcode_prefix(&query) {
|
||||
let compact_query = compact_postcode_query(&query);
|
||||
for postcode in &postcode_data.postcodes {
|
||||
if !postcode_starts_with_compact(postcode, &compact_query) {
|
||||
continue;
|
||||
}
|
||||
if property_data.rows_for_postcode(postcode).is_empty() {
|
||||
continue;
|
||||
}
|
||||
let compact_pc: String =
|
||||
postcode.chars().filter(|ch| !ch.is_whitespace()).collect();
|
||||
let score = if compact_pc == compact_query {
|
||||
960.0
|
||||
} else {
|
||||
900.0
|
||||
};
|
||||
postcode_results.push((score, postcode.clone()));
|
||||
if postcode_results.len() >= limit {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
postcode_results.sort_by(|left, right| right.0.total_cmp(&left.0));
|
||||
|
||||
// ---- Addresses ----
|
||||
let mut address_results: Vec<(f32, AddressResult)> = Vec::new();
|
||||
if mode_filter.is_none() {
|
||||
for (row, raw) in property_data.search_addresses(&query, limit) {
|
||||
let lat = property_data.lat[row];
|
||||
let lon = property_data.lon[row];
|
||||
let score = address_unified_score(raw) + proximity_bonus(bias_center, lat, lon);
|
||||
address_results.push((
|
||||
score,
|
||||
AddressResult {
|
||||
address: property_data.address(row).trim().to_string(),
|
||||
postcode: property_data.postcode(row).to_string(),
|
||||
lat,
|
||||
lon,
|
||||
},
|
||||
));
|
||||
}
|
||||
}
|
||||
address_results.sort_by(|left, right| right.0.total_cmp(&left.0));
|
||||
|
||||
// ---- Unified merge: one relevance-ordered list across every source ----
|
||||
let mut unified: Vec<UnifiedResult> = Vec::new();
|
||||
for (score, place) in &place_results {
|
||||
unified.push(UnifiedResult::Place {
|
||||
name: place.name.clone(),
|
||||
slug: place.slug.clone(),
|
||||
place_type: place.place_type.clone(),
|
||||
lat: place.lat,
|
||||
lon: place.lon,
|
||||
city: place.city.clone(),
|
||||
score: *score,
|
||||
});
|
||||
}
|
||||
for (score, postcode) in &postcode_results {
|
||||
unified.push(UnifiedResult::Postcode {
|
||||
label: postcode.clone(),
|
||||
score: *score,
|
||||
});
|
||||
}
|
||||
for (score, address) in &address_results {
|
||||
unified.push(UnifiedResult::Address {
|
||||
address: address.address.clone(),
|
||||
postcode: address.postcode.clone(),
|
||||
lat: address.lat,
|
||||
lon: address.lon,
|
||||
score: *score,
|
||||
});
|
||||
}
|
||||
unified.sort_by(|left, right| unified_score(right).total_cmp(&unified_score(left)));
|
||||
unified.truncate(limit);
|
||||
|
||||
let places: Vec<PlaceResult> = place_results.into_iter().map(|(_, p)| p).collect();
|
||||
let postcodes: Vec<String> = postcode_results.into_iter().map(|(_, p)| p).collect();
|
||||
let addresses: Vec<AddressResult> = address_results.into_iter().map(|(_, a)| a).collect();
|
||||
|
||||
let elapsed = t0.elapsed();
|
||||
info!(
|
||||
query = query.as_str(),
|
||||
results = results.len(),
|
||||
results = unified.len(),
|
||||
places = places.len(),
|
||||
postcodes = postcodes.len(),
|
||||
addresses = addresses.len(),
|
||||
scanned = pd.name_lower.len(),
|
||||
|
|
@ -262,16 +517,17 @@ pub async fn get_places(
|
|||
"GET /api/places"
|
||||
);
|
||||
|
||||
(results, postcodes, addresses)
|
||||
PlacesResponse {
|
||||
places,
|
||||
postcodes,
|
||||
addresses,
|
||||
results: unified,
|
||||
}
|
||||
})
|
||||
.await
|
||||
.map_err(|error| ApiError::Internal(error.to_string()))?;
|
||||
|
||||
Ok(Json(PlacesResponse {
|
||||
places: places.0,
|
||||
postcodes: places.1,
|
||||
addresses: places.2,
|
||||
}))
|
||||
Ok(Json(response))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
|
@ -293,4 +549,88 @@ mod tests {
|
|||
assert!(postcode_starts_with_compact("SW1A 1AA", "SW1A1"));
|
||||
assert!(!postcode_starts_with_compact("SW1A 1AA", "SW1A2"));
|
||||
}
|
||||
|
||||
/// `split_geographic_refinement` should peel a trailing outcode (optionally followed by a
/// sector digit) off a multi-word query and leave every other query untouched.
#[test]
fn refinement_splits_off_trailing_outcode() {
    // (query, expected free-text part, expected refinement)
    let cases = [
        ("camden nw1", "camden", Some("NW1")),
        ("high street cr0 2", "high street", Some("CR02")),
        // A bare outcode is not split (handled by the outcode/postcode path directly).
        ("e14", "e14", None),
        // No trailing postcode → unchanged.
        ("baker street", "baker street", None),
    ];

    for (query, text, refinement) in cases {
        assert_eq!(
            split_geographic_refinement(query),
            (text.to_string(), refinement.map(str::to_string))
        );
    }
}
|
||||
|
||||
/// Connective words such as "and" / "of" must not appear among a query's content tokens.
#[test]
fn query_tokens_drop_connectives() {
    // (query, tokens expected to survive)
    let cases = [
        ("fish and chips", vec!["fish", "chips"]),
        ("isle of dogs", vec!["isle", "dogs"]),
    ];

    for (phrase, expected) in cases {
        assert_eq!(query_content_tokens(phrase), expected);
    }
}
|
||||
|
||||
fn base(search: &str, query: &str) -> Option<f32> {
|
||||
let q = normalize_search_text(query);
|
||||
let tokens = query_content_tokens(&q);
|
||||
place_base_score(search, search, &q, &query.to_lowercase(), &tokens)
|
||||
}
|
||||
|
||||
/// Match tiers must rank exact > prefix > token-AND, and non-token matches must be rejected.
#[test]
fn place_match_tiers_order_exact_above_prefix_above_token_and() {
    // Score that is required to exist; panics (failing the test) when there is no match.
    let score_of = |search: &str, query: &str| base(search, query).unwrap();

    let exact_score = score_of("camden", "camden");
    let prefix_score = score_of("camden town", "camden");
    let token_and_score = score_of("camden market", "market camden");

    assert!(exact_score > prefix_score);
    assert!(prefix_score > token_and_score);

    // A reordered multi-word query still matches via token-AND.
    let reordered = base("manchester piccadilly", "piccadilly manchester");
    assert!(reordered.is_some());

    // Pure infix substrings no longer match (candidates are token-based): "ford" must not
    // surface "Stratford" — that was the old population-dominated noise.
    let infix = base("stratford", "ford");
    assert!(infix.is_none());

    // Appended noise that matches nothing yields no match (the route strips postcodes first).
    let with_noise = base("camden", "camden zzzz");
    assert!(with_noise.is_none());
}
|
||||
|
||||
/// A fully-specified address must outrank an outcode-prefix hit, while a road-only address
/// must rank below it.
#[test]
fn address_full_postcode_outranks_an_outcode_prefix() {
    // Outcode prefix base is 760.
    let outcode_prefix_base = 760.0;
    // raw 1200 ≈ road + full postcode + number.
    let full_match = address_unified_score(1200);
    // raw 200 is a road-only address.
    let road_only = address_unified_score(200);

    assert!(full_match > outcode_prefix_base);
    assert!(road_only < outcode_prefix_base);
    assert!(full_match > road_only);
}
|
||||
|
||||
/// The proximity bonus shrinks with distance from the bias centre, stays small enough not to
/// reorder match tiers, and is zero when no bias centre is given.
#[test]
fn proximity_bonus_decays_and_never_flips_match_tiers() {
    let bias = Some((51.5, -0.1));

    let at_centre = proximity_bonus(bias, 51.5, -0.1);
    let distant = proximity_bonus(bias, 53.5, -2.0);

    assert!(at_centre > distant);
    assert!(at_centre <= 160.0);
    // Smaller than the 180-pt gap between exact (1000) and prefix (820).
    assert!(at_centre < 180.0);

    let without_bias = proximity_bonus(None, 51.5, -0.1);
    assert_eq!(without_bias, 0.0);
}
|
||||
|
||||
/// `resolve_outcode_idx` maps a postcode-like token to the index of its outcode in `names`,
/// or to `None` when the token does not correspond to a listed outcode.
#[test]
fn resolve_outcode_idx_handles_sectorised_area_and_unknown() {
    let names = vec![String::from("nw1"), String::from("e14")];

    // (token, expected index into `names`)
    let cases = [
        // "NW16" → outcode NW1 (strips the sector digit); "E14" → exact.
        ("NW16", Some(0)),
        ("E14", Some(1)),
        // A postcode-ish token that is not a real outcode resolves to nothing (folds back).
        ("O2", None),
        ("ZZ9", None),
    ];

    for (token, expected) in cases {
        assert_eq!(resolve_outcode_idx(&names, token), expected);
    }
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue