idk

2026-06-02 13:46:18 +01:00 · 2026-06-02 13:46:18 +01:00 · d43da9708c
commit d43da9708c
parent a04ac2d857
47 changed files with 4120 additions and 573 deletions
--- a/server-rs/src/data/crime_by_year.rs
+++ b/server-rs/src/data/crime_by_year.rs
@ -120,7 +120,7 @@ impl CrimeByYearData {
                .list()
                .with_context(|| format!("Column '{col_name}' is not a list"))?;

-            for row in 0..row_count {
+            for (row, postcode) in postcode_values.iter().enumerate().take(row_count) {
                let Some(inner) = list_ca.get_as_series(row) else {
                    continue;
                };
@ -163,7 +163,7 @@ impl CrimeByYearData {
                points.sort_by_key(|p| p.year);

                series_by_postcode
-                    .entry(postcode_values[row].clone())
+                    .entry(postcode.clone())
                    .or_default()
                    .push(PostcodeCrimeSeries {
                        type_idx: type_idx as u16,
--- a/server-rs/src/data/places.rs
+++ b/server-rs/src/data/places.rs
@ -4,10 +4,16 @@ use anyhow::Context;
 use polars::frame::DataFrame;
 use polars::lazy::frame::LazyFrame;
 use polars::prelude::*;
+use rustc_hash::FxHashMap;
 use tracing::info;

 use crate::utils::InternedColumn;

+/// Upper bound on place rows scored per query (candidate sets are normally far smaller).
+const PLACE_CANDIDATE_LIMIT: usize = 50_000;
+const PLACE_PREFIX_MIN_LEN: usize = 2;
+const PLACE_PREFIX_MAX_LEN: usize = 6;
+
 pub struct PlaceData {
    pub name: Vec<String>,
    pub name_lower: Vec<String>,
@ -19,6 +25,13 @@ pub struct PlaceData {
    pub lon: Vec<f32>,
    pub city: Vec<Option<String>>,
    pub travel_destination: Vec<bool>,
+    /// Inverted index from an alias token to the (ascending) place rows containing it. Lets place
+    /// search gather candidates instead of scanning all ~1M+ rows per keystroke.
+    token_index: FxHashMap<String, Vec<u32>>,
+    /// Prefix → indexed tokens, for matching a partially-typed final word.
+    token_prefix_index: FxHashMap<String, Vec<String>>,
+    /// Trigram → fuzzy-eligible rows (any place type `is_fuzzy_eligible_type` accepts), for
+    /// bounded typo matching.
+    fuzzy_trigram_index: FxHashMap<u32, Vec<u32>>,
 }

 #[derive(Clone, Copy)]
@ -168,6 +181,148 @@ pub fn normalize_search_text(text: &str) -> String {
    result
 }

+/// Yields every non-empty token from a place's combined search text.
+///
+/// The text is cut at spaces (word breaks) and at `|` (alias breaks), so tokens from all
+/// aliases come out as one flat stream, in order and with duplicates preserved.
+pub fn place_alias_tokens(search_text: &str) -> impl Iterator<Item = &str> {
+    search_text
+        .split(|ch: char| matches!(ch, ' ' | '|'))
+        .filter(|piece| !piece.is_empty())
+}
+
+/// Hashes three characters into a `u32`: starting from a fixed seed, each character's code
+/// point is XOR-ed in and the result multiplied (wrapping) by a fixed constant.
+fn trigram_hash(first: char, second: char, third: char) -> u32 {
+    const SEED: u32 = 2_166_136_261;
+    const MULTIPLIER: u32 = 16_777_619;
+    [first, second, third]
+        .into_iter()
+        .fold(SEED, |acc, ch| (acc ^ u32::from(ch)).wrapping_mul(MULTIPLIER))
+}
+
+/// Hashed character trigrams of `text`, sorted ascending with duplicates removed.
+///
+/// The text is first passed through `normalize_search_text`; an empty normalized form yields
+/// no trigrams. Otherwise it is padded with two leading spaces and one trailing space before
+/// the 3-character windows are hashed, so word starts and ends contribute their own trigrams.
+pub fn compute_trigrams(text: &str) -> Vec<u32> {
+    let normalized = normalize_search_text(text);
+    if normalized.is_empty() {
+        return Vec::new();
+    }
+
+    let mut padded: Vec<char> = Vec::with_capacity(normalized.len() + 3);
+    padded.extend([' ', ' ']);
+    padded.extend(normalized.chars());
+    padded.push(' ');
+
+    let mut hashes: Vec<u32> = Vec::new();
+    for window in padded.windows(3) {
+        hashes.push(trigram_hash(window[0], window[1], window[2]));
+    }
+    hashes.sort_unstable();
+    hashes.dedup();
+    hashes
+}
+
+/// Returns the row ids present in both ascending-sorted slices, in ascending order.
+fn intersect_sorted(left: &[u32], right: &[u32]) -> Vec<u32> {
+    use std::cmp::Ordering;
+
+    let mut shared = Vec::new();
+    let (mut rest_left, mut rest_right) = (left, right);
+    // Walk both slices in lockstep, always advancing whichever side holds the smaller id.
+    while let (Some((&a, tail_a)), Some((&b, tail_b))) =
+        (rest_left.split_first(), rest_right.split_first())
+    {
+        match a.cmp(&b) {
+            Ordering::Less => rest_left = tail_a,
+            Ordering::Greater => rest_right = tail_b,
+            Ordering::Equal => {
+                shared.push(a);
+                rest_left = tail_a;
+                rest_right = tail_b;
+            }
+        }
+    }
+    shared
+}
+
+/// Merges two ascending-sorted row-id slices into one ascending list; an id present in both
+/// inputs is emitted once.
+fn union_sorted(left: &[u32], right: &[u32]) -> Vec<u32> {
+    use std::cmp::Ordering;
+
+    let mut merged = Vec::with_capacity(left.len() + right.len());
+    let (mut rest_left, mut rest_right) = (left, right);
+    while let (Some((&a, tail_a)), Some((&b, tail_b))) =
+        (rest_left.split_first(), rest_right.split_first())
+    {
+        match a.cmp(&b) {
+            Ordering::Less => {
+                merged.push(a);
+                rest_left = tail_a;
+            }
+            Ordering::Greater => {
+                merged.push(b);
+                rest_right = tail_b;
+            }
+            Ordering::Equal => {
+                merged.push(a);
+                rest_left = tail_a;
+                rest_right = tail_b;
+            }
+        }
+    }
+    // At most one side still has ids left; append whatever remains.
+    merged.extend_from_slice(rest_left);
+    merged.extend_from_slice(rest_right);
+    merged
+}
+
+/// Distinct tokens of at least two bytes across all of a place's search aliases, sorted
+/// ascending. These are the keys a row is filed under in the token index.
+///
+/// NOTE(review): the prefix index byte-slices these tokens, which assumes
+/// `normalize_search_text` only ever emits ASCII — confirm against that function.
+fn place_index_tokens(search_text: &str) -> Vec<String> {
+    // A sorted set gives both the ordering and the de-duplication in one step.
+    let distinct: std::collections::BTreeSet<&str> = place_alias_tokens(search_text)
+        .filter(|token| token.len() >= 2)
+        .collect();
+    distinct.into_iter().map(str::to_owned).collect()
+}
+
+/// Builds the prefix → indexed-token map used to expand a partially-typed query word.
+///
+/// Every key of `token_index` is filed under each of its prefixes from `PLACE_PREFIX_MIN_LEN`
+/// to `PLACE_PREFIX_MAX_LEN` bytes (capped at the token's own length). Each prefix's token
+/// list is sorted and de-duplicated so lookups are deterministic.
+fn build_place_prefix_index(
+    token_index: &FxHashMap<String, Vec<u32>>,
+) -> FxHashMap<String, Vec<String>> {
+    let mut prefix_index: FxHashMap<String, Vec<String>> = FxHashMap::default();
+    for token in token_index.keys() {
+        let max_len = token.len().min(PLACE_PREFIX_MAX_LEN);
+        for len in PLACE_PREFIX_MIN_LEN..=max_len {
+            // `get` yields `None` when `len` falls inside a multi-byte character, where plain
+            // `token[..len]` slicing would panic. Such lengths are skipped.
+            let Some(prefix) = token.get(..len) else {
+                continue;
+            };
+            prefix_index
+                .entry(prefix.to_string())
+                .or_default()
+                .push(token.clone());
+        }
+    }
+    for tokens in prefix_index.values_mut() {
+        tokens.sort_unstable();
+        tokens.dedup();
+    }
+    prefix_index
+}
+
+/// Whether a place type participates in fuzzy (typo) matching.
+///
+/// This is a deny-list: `street`, `park`, `attraction`, `hospital` and `retail` rows are
+/// excluded, and every other type (including unknown ones) is eligible. Leaving out the
+/// high-volume street/POI rows keeps the fuzzy index bounded.
+fn is_fuzzy_eligible_type(place_type: &str) -> bool {
+    match place_type {
+        "street" | "park" | "attraction" | "hospital" | "retail" => false,
+        _ => true,
+    }
+}
+
+/// Jaccard similarity (|A ∩ B| / |A ∪ B|) of two ascending-sorted trigram sets, in 0.0–1.0.
+/// Returns 0.0 when either set is empty.
+pub fn trigram_similarity(left: &[u32], right: &[u32]) -> f32 {
+    use std::cmp::Ordering;
+
+    if left.is_empty() || right.is_empty() {
+        return 0.0;
+    }
+
+    let mut shared = 0usize;
+    let mut lhs = left.iter().peekable();
+    let mut rhs = right.iter().peekable();
+    while let (Some(&&a), Some(&&b)) = (lhs.peek(), rhs.peek()) {
+        match a.cmp(&b) {
+            Ordering::Less => {
+                lhs.next();
+            }
+            Ordering::Greater => {
+                rhs.next();
+            }
+            Ordering::Equal => {
+                shared += 1;
+                lhs.next();
+                rhs.next();
+            }
+        }
+    }
+
+    // |A ∪ B| = |A| + |B| − |A ∩ B|.
+    let union = left.len() + right.len() - shared;
+    shared as f32 / union as f32
+}
+
 fn replace_token(text: &str, from: &str, to: &str) -> Option<String> {
    let mut changed = false;
    let replaced: Vec<&str> = text
@ -191,15 +346,31 @@ fn push_alias(aliases: &mut Vec<String>, alias: String) {
    }
 }

+/// Bidirectional token abbreviations expanded into search aliases so a query typed either
+/// way matches (e.g. "gt missenden" ↔ "Great Missenden", "mt" ↔ "Mount").
+///
+/// Each entry is a one-way `(from, to)` replacement applied by `build_search_text`, which is
+/// why every pair is listed twice, once per direction.
+const PLACE_TOKEN_ALIASES: &[(&str, &str)] = &[
+    ("st", "saint"),
+    ("saint", "st"),
+    ("mt", "mount"),
+    ("mount", "mt"),
+    ("gt", "great"),
+    ("great", "gt"),
+    ("lt", "little"),
+    ("little", "lt"),
+    ("upr", "upper"),
+    ("upper", "upr"),
+    ("lwr", "lower"),
+    ("lower", "lwr"),
+];
+
 fn build_search_text(name: &str, place_type: &str) -> String {
    let primary = normalize_search_text(name);
    let mut aliases = vec![primary.clone()];

-    if let Some(alias) = replace_token(&primary, "st", "saint") {
-        push_alias(&mut aliases, alias);
-    }
-    if let Some(alias) = replace_token(&primary, "saint", "st") {
-        push_alias(&mut aliases, alias);
+    for (from, to) in PLACE_TOKEN_ALIASES {
+        if let Some(alias) = replace_token(&primary, from, to) {
+            push_alias(&mut aliases, alias);
+        }
    }

    if place_type == "station" {
@ -391,6 +562,26 @@ impl PlaceData {
            fallback_city
        };

+        // Build the place search index: an inverted token index over all rows (so the per-query
+        // cost scales with matched candidates, not the ~1M-row corpus), plus a trigram index over
+        // only fuzzy-eligible rows for bounded typo matching.
+        let mut token_index: FxHashMap<String, Vec<u32>> = FxHashMap::default();
+        let mut fuzzy_trigram_index: FxHashMap<u32, Vec<u32>> = FxHashMap::default();
+        for idx in 0..row_count {
+            for token in place_index_tokens(&name_search[idx]) {
+                token_index.entry(token).or_default().push(idx as u32);
+            }
+            if is_fuzzy_eligible_type(&place_type_raw[idx]) {
+                for trigram in compute_trigrams(&name[idx]) {
+                    fuzzy_trigram_index
+                        .entry(trigram)
+                        .or_default()
+                        .push(idx as u32);
+                }
+            }
+        }
+        let token_prefix_index = build_place_prefix_index(&token_index);
+
        let with_pop = population.iter().filter(|&&pop| pop > 0).count();
        let with_city = city.iter().filter(|c| c.is_some()).count();
        info!(
@ -398,6 +589,8 @@ impl PlaceData {
            types = place_type.values.len(),
            with_population = with_pop,
            with_city = with_city,
+            tokens = token_index.len(),
+            fuzzy_trigrams = fuzzy_trigram_index.len(),
            "Place data loaded"
        );

@ -412,14 +605,261 @@ impl PlaceData {
            lon,
            city,
            travel_destination,
+            token_index,
+            token_prefix_index,
+            fuzzy_trigram_index,
        })
    }
+
+    /// Gathers the place rows worth scoring for the given query tokens.
+    ///
+    /// Query tokens that exactly match an indexed token have their posting lists intersected,
+    /// smallest list first; tokens with no exact match are ignored at this stage. When no
+    /// token matches exactly, the result is seeded from the prefix index instead, so a
+    /// partially-typed word still finds rows. The result is capped at `PLACE_CANDIDATE_LIMIT`.
+    pub fn place_candidate_rows(&self, tokens: &[&str]) -> Vec<u32> {
+        let mut postings: Vec<&[u32]> = Vec::new();
+        for token in tokens {
+            if let Some(rows) = self.token_index.get(*token) {
+                postings.push(rows.as_slice());
+            }
+        }
+        // Starting from the shortest list keeps every intermediate intersection small.
+        postings.sort_by_key(|rows| rows.len());
+
+        let mut candidates = match postings.split_first() {
+            None => self.place_prefix_seed(tokens),
+            Some((smallest, rest)) => {
+                let mut acc = smallest.to_vec();
+                for rows in rest {
+                    if acc.is_empty() {
+                        break;
+                    }
+                    acc = intersect_sorted(&acc, rows);
+                }
+                acc
+            }
+        };
+        candidates.truncate(PLACE_CANDIDATE_LIMIT);
+        candidates
+    }
+
+    /// Seeds candidates from the prefix index, for queries where no word matched an indexed
+    /// token exactly.
+    ///
+    /// For each query token of at least `PLACE_PREFIX_MIN_LEN` bytes, looks up the indexed
+    /// tokens filed under its leading (at most `PLACE_PREFIX_MAX_LEN`-byte) prefix, keeps those
+    /// that start with the whole query token, and unions their posting lists. Returns the
+    /// smallest non-empty union across the query tokens, or an empty vector if none matched.
+    fn place_prefix_seed(&self, tokens: &[&str]) -> Vec<u32> {
+        let mut best: Option<Vec<u32>> = None;
+        for token in tokens {
+            if token.len() < PLACE_PREFIX_MIN_LEN {
+                continue;
+            }
+            // Back the key length off to a char boundary: slicing mid-character would panic if
+            // a query token ever contains multi-byte UTF-8. Offset 0 is always a boundary, so
+            // the loop terminates.
+            let mut key_len = token.len().min(PLACE_PREFIX_MAX_LEN);
+            while !token.is_char_boundary(key_len) {
+                key_len -= 1;
+            }
+            if key_len < PLACE_PREFIX_MIN_LEN {
+                continue;
+            }
+            let Some(indexed) = self.token_prefix_index.get(&token[..key_len]) else {
+                continue;
+            };
+            let mut union: Vec<u32> = Vec::new();
+            for indexed_token in indexed {
+                // The index key is only a bounded prefix; require the full typed token to match.
+                if !indexed_token.starts_with(*token) {
+                    continue;
+                }
+                if let Some(rows) = self.token_index.get(indexed_token) {
+                    union = if union.is_empty() {
+                        rows.clone()
+                    } else {
+                        union_sorted(&union, rows)
+                    };
+                }
+            }
+            if !union.is_empty()
+                && best
+                    .as_ref()
+                    .is_none_or(|current| union.len() < current.len())
+            {
+                best = Some(union);
+            }
+        }
+        best.unwrap_or_default()
+    }
+
+    /// Rows from the fuzzy trigram index that share enough trigrams with the query to be worth
+    /// full similarity scoring.
+    ///
+    /// A row qualifies when it shares at least 40% (rounded up, minimum 1) of the query's
+    /// trigrams. Only rows present in the fuzzy index are considered. The returned rows are in
+    /// hash-map iteration order, not sorted.
+    pub fn fuzzy_candidate_rows(&self, query_trigrams: &[u32]) -> Vec<u32> {
+        if query_trigrams.is_empty() {
+            return Vec::new();
+        }
+
+        // Tally, per row, how many of the query's trigrams it contains.
+        let mut shared_by_row: FxHashMap<u32, u16> = FxHashMap::default();
+        let matched_rows = query_trigrams
+            .iter()
+            .filter_map(|trigram| self.fuzzy_trigram_index.get(trigram))
+            .flatten();
+        for &row in matched_rows {
+            *shared_by_row.entry(row).or_default() += 1;
+        }
+
+        let threshold = (((query_trigrams.len() as f32) * 0.4).ceil() as u16).max(1);
+        let mut eligible = Vec::new();
+        for (row, shared) in shared_by_row {
+            if shared >= threshold {
+                eligible.push(row);
+            }
+        }
+        eligible
+    }
+}
+
+#[cfg(test)]
+impl PlaceData {
+    /// Test-only constructor: builds a `PlaceData` from `(name, place_type)` pairs with the
+    /// search indexes populated and every other column filled with placeholder values
+    /// (zero population/coordinates, no city, not a travel destination).
+    fn from_names<S: AsRef<str>>(rows: &[(S, S)]) -> Self {
+        let (name, place_type_raw): (Vec<String>, Vec<String>) = rows
+            .iter()
+            .map(|(nm, pt)| (nm.as_ref().to_string(), pt.as_ref().to_string()))
+            .unzip();
+        let len = name.len();
+        let name_lower: Vec<String> = name.iter().map(|nm| nm.to_lowercase()).collect();
+        let name_search: Vec<String> = name
+            .iter()
+            .zip(&place_type_raw)
+            .map(|(nm, pt)| build_search_text(nm, pt))
+            .collect();
+
+        // Build the token index over all rows and the trigram index over fuzzy-eligible rows.
+        let mut token_index: FxHashMap<String, Vec<u32>> = FxHashMap::default();
+        let mut fuzzy_trigram_index: FxHashMap<u32, Vec<u32>> = FxHashMap::default();
+        let columns = name.iter().zip(&name_search).zip(&place_type_raw);
+        for (idx, ((display, search), raw_type)) in columns.enumerate() {
+            let row = idx as u32;
+            for token in place_index_tokens(search) {
+                token_index.entry(token).or_default().push(row);
+            }
+            if !is_fuzzy_eligible_type(raw_type) {
+                continue;
+            }
+            for trigram in compute_trigrams(display) {
+                fuzzy_trigram_index.entry(trigram).or_default().push(row);
+            }
+        }
+        let token_prefix_index = build_place_prefix_index(&token_index);
+
+        PlaceData {
+            name,
+            name_lower,
+            name_search,
+            place_type: InternedColumn::build(&place_type_raw),
+            type_rank: place_type_raw.iter().map(|pt| type_rank(pt)).collect(),
+            population: vec![0; len],
+            lat: vec![0.0; len],
+            lon: vec![0.0; len],
+            city: vec![None; len],
+            travel_destination: vec![false; len],
+            token_index,
+            token_prefix_index,
+            fuzzy_trigram_index,
+        }
+    }
 }

 #[cfg(test)]
 mod tests {
    use super::*;

+    #[test]
+    fn place_index_tokens_dedup_and_min_length() {
+        // Aliases split on " | "; the repeated "albans" is de-duplicated and the output sorted.
+        assert_eq!(
+            place_index_tokens("st albans | saint albans"),
+            vec!["albans".to_string(), "saint".to_string(), "st".to_string()]
+        );
+        // Single-character tokens ("a", "x") fall below the two-character minimum.
+        assert_eq!(
+            place_index_tokens("a road | x road"),
+            vec!["road".to_string()]
+        );
+    }
+
+    #[test]
+    fn place_candidate_rows_intersect_and_prefix_seed() {
+        // Row ids follow the order below: 0..=2 are the Camden rows, 3..=4 the Manchester rows.
+        let places = PlaceData::from_names(&[
+            ("Camden", "suburb"),
+            ("Camden Town", "suburb"),
+            ("Camden Market", "attraction"),
+            ("Manchester", "city"),
+            ("Manchester Piccadilly", "station"),
+        ]);
+
+        // One fully-typed word returns its whole posting list.
+        assert_eq!(places.place_candidate_rows(&["camden"]), vec![0, 1, 2]);
+
+        // Two fully-typed words keep only rows containing both.
+        assert_eq!(places.place_candidate_rows(&["camden", "town"]), vec![1]);
+
+        // With no exact token match, the partially-typed word is resolved via the prefix index.
+        assert_eq!(places.place_candidate_rows(&["piccad"]), vec![4]);
+
+        // A word matching nothing yields no candidates.
+        assert!(places.place_candidate_rows(&["zzzz"]).is_empty());
+    }
+
+    // Run with: cargo test --release bench_place_search -- --ignored --nocapture
+    //
+    // Timing smoke test for candidate-based place search over ~1M synthetic street rows plus
+    // one city row. Ignored by default; the pass/fail threshold is wall-clock based.
+    #[test]
+    #[ignore]
+    fn bench_place_search_at_one_million_rows() {
+        let roads = [
+            "High Street",
+            "Station Road",
+            "Church Lane",
+            "Victoria Road",
+            "Mill Lane",
+            "Park Avenue",
+            "Queens Road",
+            "Kings Road",
+        ];
+        let mut rows: Vec<(String, String)> = Vec::with_capacity(1_000_000);
+        for i in 0..1_000_000usize {
+            // Vary the name so the index resembles ~1M distinct (street, area) rows.
+            rows.push((
+                format!("{} {}", roads[i % roads.len()], i % 4000),
+                "street".into(),
+            ));
+        }
+        rows.push(("London".into(), "city".into()));
+        // Index construction happens before the timer starts, so only lookup + scoring is timed.
+        let pd = PlaceData::from_names(&rows);
+
+        let start = std::time::Instant::now();
+        let mut hits = 0usize;
+        // Repeat the same query 50 times and report the mean per-query time.
+        for _ in 0..50 {
+            let candidates = pd.place_candidate_rows(&["high", "street"]);
+            for row in candidates {
+                let idx = row as usize;
+                if place_search_test_score(&pd, idx, "high street", &["high", "street"]).is_some() {
+                    hits += 1;
+                }
+            }
+        }
+        let per_query = start.elapsed() / 50;
+        println!(
+            "indexed place search over {} rows: {:?}/query ({} hits)",
+            pd.name.len(),
+            per_query,
+            hits / 50
+        );
+        // The old full O(N) scan measured ~36ms here; candidate-based must be far under that.
+        // NOTE(review): this threshold depends on the machine and on building with --release.
+        assert!(per_query.as_millis() < 10, "per_query was {per_query:?}");
+    }
+
+    /// Per-candidate match check used by the bench. Intended to mirror the route's scoring —
+    /// NOTE(review): the route is not visible from here; confirm the two stay in sync.
+    ///
+    /// Returns `Some(640.0)` when every query token equals, or (for tokens of two or more
+    /// bytes) prefixes, some alias token of the row; otherwise `Some(1000.0)` when the
+    /// lowercased name equals `query_search`; otherwise `None`.
+    fn place_search_test_score(
+        pd: &PlaceData,
+        idx: usize,
+        query_search: &str,
+        query_tokens: &[&str],
+    ) -> Option<f32> {
+        let matches_some_alias_token = |query_token: &str| {
+            place_alias_tokens(&pd.name_search[idx]).any(|alias_token| {
+                alias_token == query_token
+                    || (query_token.len() >= 2 && alias_token.starts_with(query_token))
+            })
+        };
+
+        if query_tokens.iter().all(|&qt| matches_some_alias_token(qt)) {
+            return Some(640.0);
+        }
+        (pd.name_lower[idx] == query_search).then_some(1000.0)
+    }
+
+    #[test]
+    fn fuzzy_candidate_rows_finds_typos_only_for_eligible_rows() {
+        // Row 0 is a city (fuzzy-eligible); row 1 is a street (kept out of the fuzzy index).
+        let places = PlaceData::from_names(&[("London", "city"), ("Baker Street", "street")]);
+
+        let misspelled = compute_trigrams("Londn");
+        let reachable = places.fuzzy_candidate_rows(&misspelled);
+
+        assert!(reachable.contains(&0));
+        assert!(!reachable.contains(&1));
+    }
+
    fn test_city_rows() -> [(&'static str, f32, f32, u32); 5] {
        [
            ("London", 51.507_446, -0.1277653, 8_908_083),
@ -470,6 +910,29 @@ mod tests {
        assert!(build_search_text("Shadwell DLR station", "station").contains("shadwell station"));
    }

+    #[test]
+    fn search_text_expands_directional_and_size_abbreviations() {
+        assert!(build_search_text("Great Missenden", "village").contains("gt missenden"));
+        assert!(build_search_text("Mount Pleasant", "suburb").contains("mt pleasant"));
+        assert!(build_search_text("Little Venice", "suburb").contains("lt venice"));
+    }
+
+    #[test]
+    fn trigram_similarity_is_high_for_typos_and_low_for_unrelated() {
+        let london = compute_trigrams("London");
+        let typo = compute_trigrams("Londn");
+        let other = compute_trigrams("Manchester");
+        assert!(trigram_similarity(&london, &typo) >= 0.4);
+        assert!(trigram_similarity(&london, &other) < 0.2);
+        assert!((trigram_similarity(&london, &london) - 1.0).abs() < 1e-6);
+    }
+
+    #[test]
+    fn place_alias_tokens_split_across_aliases() {
+        let tokens: Vec<&str> = place_alias_tokens("kings cross | kings x").collect();
+        assert_eq!(tokens, vec!["kings", "cross", "kings", "x"]);
+    }
+
    #[test]
    fn travel_destination_types_match_legacy_places() {
        assert!(is_travel_destination_type("city"));
--- a/server-rs/src/data/poi.rs
+++ b/server-rs/src/data/poi.rs
@ -398,7 +398,7 @@ fn build_school_meta(

    let mut idx = vec![u32::MAX; row_count];
    let mut meta = Vec::new();
-    for row in 0..row_count {
+    for (row, meta_idx) in idx.iter_mut().enumerate().take(row_count) {
        let type_group_val = fetch_str(&type_group, row);
        let type_val = fetch_str(&r#type, row);
        // type_group is present for every GIAS row, so use it as the sentinel
@ -406,7 +406,7 @@ fn build_school_meta(
        if type_group_val.is_none() && type_val.is_none() {
            continue;
        }
-        idx[row] = meta.len() as u32;
+        *meta_idx = meta.len() as u32;
        meta.push(SchoolMetadata {
            phase: fetch_str(&phase, row),
            r#type: type_val,
--- a/server-rs/src/data/property.rs
+++ b/server-rs/src/data/property.rs
@ -10,8 +10,10 @@ use rustc_hash::{FxHashMap, FxHashSet};
 use crate::consts::{H3_PRECOMPUTE_MAX, HISTOGRAM_BINS, NAN_U16, QUANT_SCALE};
 use crate::features::{self, Bounds};

-const ADDRESS_SEARCH_CANDIDATE_LIMIT: usize = 50_000;
-const ADDRESS_SEARCH_MAX_POSTINGS_PER_TOKEN: usize = 250_000;
+/// Upper bound on rows scored per query. Intersection keeps most candidate sets far below
+/// this; only a single very common road word (e.g. "high") approaches it, and the in-area
+/// priority sort keeps a refined query's matches ahead of the cut.
+const ADDRESS_SEARCH_CANDIDATE_LIMIT: usize = 150_000;
 const ADDRESS_SEARCH_PREFIX_MIN_LEN: usize = 4;
 const ADDRESS_SEARCH_PREFIX_MAX_LEN: usize = 8;
 const NO_POI_METRIC_ROW: u32 = u32::MAX;
@ -162,6 +164,11 @@ struct AddressTermGroup {
 #[derive(Debug)]
 struct AddressQuery {
    full_postcode: Option<String>,
+    /// Compact uppercase outward code (optionally with a sector digit) recovered when the
+    /// user appended a partial postcode like "NW1" or "NW1 6". Used as an additive ranking
+    /// bias, never as a hard filter — so the disambiguating hint is honoured without
+    /// excluding the same road in other areas.
+    postcode_area: Option<String>,
    text_groups: Vec<AddressTermGroup>,
    numeric_terms: Vec<String>,
    candidate_terms: Vec<String>,
@ -442,6 +449,138 @@ fn build_address_prefix_index(
    prefix_index
 }

+/// Returns the row ids common to both ascending-sorted slices, in ascending order.
+fn intersect_sorted(left: &[u32], right: &[u32]) -> Vec<u32> {
+    let mut common = Vec::new();
+    let mut lhs = left.iter().copied().peekable();
+    let mut rhs = right.iter().copied().peekable();
+    // Advance whichever cursor points at the smaller id; equal ids are emitted once.
+    while let (Some(&a), Some(&b)) = (lhs.peek(), rhs.peek()) {
+        if a < b {
+            lhs.next();
+        } else if a > b {
+            rhs.next();
+        } else {
+            common.push(a);
+            lhs.next();
+            rhs.next();
+        }
+    }
+    common
+}
+
+/// Merges two ascending-sorted row-id slices into one ascending list; an id found in both
+/// inputs appears once in the output.
+fn union_sorted(left: &[u32], right: &[u32]) -> Vec<u32> {
+    let mut merged = Vec::with_capacity(left.len() + right.len());
+    let mut lhs = left.iter().copied().peekable();
+    let mut rhs = right.iter().copied().peekable();
+    while let (Some(&a), Some(&b)) = (lhs.peek(), rhs.peek()) {
+        if a <= b {
+            merged.push(a);
+            lhs.next();
+            if a == b {
+                // Same id on both sides: consume the right copy too so it is not repeated.
+                rhs.next();
+            }
+        } else {
+            merged.push(b);
+            rhs.next();
+        }
+    }
+    // Only one side can still hold ids at this point.
+    merged.extend(lhs);
+    merged.extend(rhs);
+    merged
+}
+
+/// An ordinal like "1st", "2nd", "3rd", "21st" — part of the street name ("2nd Avenue"), not a
+/// house-number prefix.
+///
+/// True when the token is one or more ASCII digits followed by `st`, `nd`, `rd` or `th`. The
+/// suffix is not checked against the number, so "1th" also counts.
+fn is_ordinal_token(token: &str) -> bool {
+    // `strip_suffix` never splits inside a character, unlike a fixed `len - 2` byte split,
+    // which panics on tokens such as "éa" where that offset lands mid-character.
+    ["st", "nd", "rd", "th"].into_iter().any(|suffix| {
+        token
+            .strip_suffix(suffix)
+            .is_some_and(|digits| !digits.is_empty() && digits.bytes().all(|b| b.is_ascii_digit()))
+    })
+}
+
+/// Whether a leading address token denotes a unit or house number rather than the street.
+///
+/// Ordinals ("2nd") are never prefixes. Otherwise a token counts when it is a unit word
+/// ("flat", "apt", …), is a single byte long, consists only of ASCII digits, or starts with
+/// an ASCII digit and contains an ASCII letter (e.g. "12a").
+fn is_house_prefix_token(token: &str) -> bool {
+    const UNIT_WORDS: [&str; 9] = [
+        "flat",
+        "fl",
+        "apartment",
+        "apt",
+        "unit",
+        "no",
+        "block",
+        "floor",
+        "room",
+    ];
+
+    if is_ordinal_token(token) {
+        return false;
+    }
+    if UNIT_WORDS.contains(&token) || token.len() == 1 {
+        return true;
+    }
+    if token.chars().all(|ch| ch.is_ascii_digit()) {
+        return true;
+    }
+    let starts_with_digit = token.starts_with(|ch: char| ch.is_ascii_digit());
+    starts_with_digit && token.chars().any(|ch| ch.is_ascii_alphabetic())
+}
+
+/// Street-level key for an address: the address tokens with any leading house-number / flat
+/// prefix tokens removed, joined by single spaces, so "12 Baker Street" and "5 Baker Street"
+/// share one key. If every token is a prefix token, all tokens are kept.
+fn street_key(address: &str) -> String {
+    let tokens = tokenize_address_text(address);
+    let first_street_token = tokens
+        .iter()
+        .position(|token| !is_house_prefix_token(token));
+    match first_street_token {
+        Some(start) => tokens[start..].join(" "),
+        // Nothing but prefix tokens: fall back to the full token list rather than an empty key.
+        None => tokens.join(" "),
+    }
+}
+
+/// Road-type words. Their presence (with no house number) marks a road browse, which we
+/// collapse to one result per street.
+///
+/// Entries are compared against whole query tokens (see `query_has_road_type`), and the list
+/// holds both full words and their abbreviations.
+const ROAD_TYPE_TOKENS: &[&str] = &[
+    "street",
+    "st",
+    "road",
+    "rd",
+    "lane",
+    "ln",
+    "avenue",
+    "ave",
+    "close",
+    "cl",
+    "drive",
+    "dr",
+    "way",
+    "court",
+    "ct",
+    "crescent",
+    "cres",
+    "place",
+    "terrace",
+    "terr",
+    "grove",
+    "gardens",
+    "gdns",
+    "walk",
+    "row",
+    "square",
+    "sq",
+    "hill",
+    "parade",
+    "mews",
+    "embankment",
+    "broadway",
+    "boulevard",
+    "blvd",
+];
+
+/// Whether any token of `query` (as split by `tokenize_address_text`) is a road-type word
+/// from `ROAD_TYPE_TOKENS`.
+fn query_has_road_type(query: &str) -> bool {
+    for token in tokenize_address_text(query).iter() {
+        if ROAD_TYPE_TOKENS.contains(&token.as_str()) {
+            return true;
+        }
+    }
+    false
+}
+
+/// The outward code of a canonical postcode: everything before the first space, or the whole
+/// string when it contains no space.
+fn outcode_of(postcode: &str) -> &str {
+    postcode
+        .split_once(' ')
+        .map_or(postcode, |(outward, _inward)| outward)
+}
+
 fn parse_address_query(query: &str) -> AddressQuery {
    let tokens = tokenize_address_text(query);
    let (full_postcode, postcode_token_indices) = extract_full_postcode(&tokens)
@ -449,12 +588,45 @@ fn parse_address_query(query: &str) -> AddressQuery {
        .unwrap_or((None, Vec::new()));

    let skip_postcode_tokens: FxHashSet<usize> = postcode_token_indices.into_iter().collect();
+
+    // Recover an appended partial postcode (outcode, or outcode + sector digit) as a ranking
+    // bias rather than discarding it — but only from the TRAILING position, so a leading road
+    // designation like "A4 Great West Road" is not mistaken for an area refinement.
+    let mut postcode_area: Option<String> = None;
+    let mut consumed_partial_tokens: FxHashSet<usize> = FxHashSet::default();
+    if full_postcode.is_none() && !tokens.is_empty() {
+        let last = tokens.len() - 1;
+        if !skip_postcode_tokens.contains(&last) {
+            let sector_digit =
+                tokens[last].len() == 1 && tokens[last].chars().all(|ch| ch.is_ascii_digit());
+            if last >= 1
+                && sector_digit
+                && !skip_postcode_tokens.contains(&(last - 1))
+                && looks_like_postcode_fragment(&tokens[last - 1])
+            {
+                postcode_area = Some(format!(
+                    "{}{}",
+                    tokens[last - 1].to_ascii_uppercase(),
+                    tokens[last]
+                ));
+                consumed_partial_tokens.insert(last);
+                consumed_partial_tokens.insert(last - 1);
+            } else if looks_like_postcode_fragment(&tokens[last]) {
+                postcode_area = Some(tokens[last].to_ascii_uppercase());
+                consumed_partial_tokens.insert(last);
+            }
+        }
+    }
+
    let mut text_groups = Vec::new();
    let mut numeric_terms = Vec::new();
    let mut candidate_terms = Vec::new();

    for (idx, token) in tokens.iter().enumerate() {
-        if skip_postcode_tokens.contains(&idx) || looks_like_postcode_fragment(token) {
+        if skip_postcode_tokens.contains(&idx)
+            || consumed_partial_tokens.contains(&idx)
+            || looks_like_postcode_fragment(token)
+        {
            continue;
        }

@ -486,6 +658,7 @@ fn parse_address_query(query: &str) -> AddressQuery {

    AddressQuery {
        full_postcode,
+        postcode_area,
        text_groups,
        numeric_terms,
        candidate_terms,
@ -897,9 +1070,15 @@ impl PropertyData {
        &self.address_search_token_keys[offset..offset + length]
    }

-    /// Search individual property addresses. Full postcode queries use a direct row index;
-    /// free-text queries use a small inverted index over distinctive address tokens.
-    pub fn search_addresses(&self, query: &str, limit: usize) -> Vec<usize> {
+    /// Search individual property addresses, returning `(row, score)` ranked best-first.
+    ///
+    /// Candidate rows come from intersecting the posting lists of the distinctive words the
+    /// user typed in full (so "Cherry Hinton Road" narrows to rows containing both), unioned
+    /// with the exact-postcode rows when a complete postcode is present (so a postcode is a
+    /// boost, not an all-or-nothing gate). An appended partial postcode keeps in-area rows
+    /// ahead of the candidate cut and adds a scoring bias. With a road-type word and no house
+    /// number, results collapse to one row per street.
+    pub fn search_addresses(&self, query: &str, limit: usize) -> Vec<(usize, i32)> {
        if limit == 0 {
            return Vec::new();
        }
@ -912,25 +1091,45 @@ impl PropertyData {
            return Vec::new();
        }

-        let candidate_rows: Vec<u32> = if let Some(postcode) = parsed.full_postcode.as_deref() {
-            self.postcode_interner
+        let mut candidate_rows = self.address_candidate_rows(&parsed.candidate_terms);
+
+        // A complete postcode contributes its rows too, instead of replacing the road match.
+        if let Some(postcode) = parsed.full_postcode.as_deref() {
+            if let Some(rows) = self
+                .postcode_interner
                .get(postcode)
                .and_then(|key| self.postcode_row_index.get(&key))
-                .map(|rows| rows.to_vec())
-                .unwrap_or_default()
-        } else if let Some(rows) = self.best_address_token_rows(&parsed.candidate_terms) {
-            rows.iter()
-                .take(ADDRESS_SEARCH_CANDIDATE_LIMIT)
-                .copied()
-                .collect()
-        } else {
-            Vec::new()
-        };
+            {
+                candidate_rows = if candidate_rows.is_empty() {
+                    rows.clone()
+                } else {
+                    union_sorted(&candidate_rows, rows)
+                };
+            }
+        }

        if candidate_rows.is_empty() {
            return Vec::new();
        }

+        // When the user appended a partial postcode, keep in-area rows ahead of the cut so the
+        // refinement still surfaces even for very common roads. Single pass (stable partition) so
+        // the postcode check — which allocates — runs exactly once per candidate.
+        if let Some(area) = parsed.postcode_area.as_deref() {
+            let mut in_area = Vec::new();
+            let mut others = Vec::new();
+            for &row in &candidate_rows {
+                if self.row_postcode_in_area(row as usize, area) {
+                    in_area.push(row);
+                } else {
+                    others.push(row);
+                }
+            }
+            in_area.extend(others);
+            candidate_rows = in_area;
+        }
+        candidate_rows.truncate(ADDRESS_SEARCH_CANDIDATE_LIMIT);
+
        let mut scored: Vec<(i32, usize, usize)> = candidate_rows
            .into_iter()
            .filter_map(|row| {
@ -948,18 +1147,29 @@ impl PropertyData {
                .then(left.2.cmp(&right.2))
        });

+        // Collapse a road browse (road-type word, no house number) to one row per street.
+        let collapse_streets = parsed.numeric_terms.is_empty() && query_has_road_type(query);
+
        let mut seen = FxHashSet::default();
        let mut results = Vec::with_capacity(limit);
-        for (_, _, row) in scored {
+        for (score, _, row) in scored {
            let address = self.address(row).trim();
            if address.is_empty() {
                continue;
            }
-            let key = format!("{}\n{}", address.to_ascii_lowercase(), self.postcode(row));
+            let key = if collapse_streets {
+                format!(
+                    "{}\n{}",
+                    street_key(address),
+                    outcode_of(self.postcode(row))
+                )
+            } else {
+                format!("{}\n{}", address.to_ascii_lowercase(), self.postcode(row))
+            };
            if !seen.insert(key) {
                continue;
            }
-            results.push(row);
+            results.push((row, score));
            if results.len() == limit {
                break;
            }
@ -968,36 +1178,75 @@ impl PropertyData {
        results
    }

-    fn best_address_token_rows(&self, terms: &[String]) -> Option<&[u32]> {
-        let mut best: Option<&[u32]> = None;
-
-        for term in terms {
-            if let Some(rows) = self.address_token_index.get(term) {
-                if best.is_none_or(|current| rows.len() < current.len()) {
-                    best = Some(rows.as_slice());
-                }
-                continue;
-            }
-
-            if term.len() < 4 {
-                continue;
-            }
-
-            if let Some(tokens) = self.address_prefix_index.get(address_prefix_key(term)) {
-                for token in tokens {
-                    if !token.starts_with(term) {
-                        continue;
-                    }
-                    if let Some(rows) = self.address_token_index.get(token) {
-                        if best.is_none_or(|current| rows.len() < current.len()) {
-                            best = Some(rows.as_slice());
-                        }
-                    }
-                }
+    /// Reports whether the postcode stored for `row` begins with the compact partial postcode
+    /// `area` (e.g. "NW1" or "NW16" matches "NW1 6XE").
+    ///
+    /// Whitespace in the stored postcode is skipped and its characters are ASCII-uppercased
+    /// before comparing; `area` is compared exactly as given. An empty `area` matches every row.
+    ///
+    /// Both strings are walked in lockstep instead of building a compacted copy of the postcode,
+    /// so the check creates no intermediate `String` and stops at the first mismatch. That
+    /// matters because it runs once per candidate row.
+    fn row_postcode_in_area(&self, row: usize, area: &str) -> bool {
+        let mut wanted = area.chars();
+        for ch in self.postcode(row).chars().filter(|ch| !ch.is_whitespace()) {
+            // `area` ran out while the postcode still has characters: `area` is a prefix.
+            let Some(expected) = wanted.next() else {
+                return true;
+            };
+            if ch.to_ascii_uppercase() != expected {
+                return false;
            }
        }
+        // The postcode ran out first; it only matches if `area` was fully consumed as well.
+        wanted.next().is_none()
+    }

-        best
+    /// Gathers candidate rows for the distinctive words of a query.
+    ///
+    /// Every term that is itself an indexed token contributes its posting list, and the result
+    /// is the intersection of those lists, folded starting from the shortest one. Terms with no
+    /// exact token are ignored in that case. Only when no term matches a token exactly does the
+    /// lookup fall back to `prefix_seed_rows`, so a still-being-typed word keeps working.
+    fn address_candidate_rows(&self, terms: &[String]) -> Vec<u32> {
+        let mut postings: Vec<&[u32]> = Vec::with_capacity(terms.len());
+        for term in terms {
+            if let Some(rows) = self.address_token_index.get(term) {
+                postings.push(rows.as_slice());
+            }
+        }
+
+        // Shortest list first, so the running intersection starts as small as possible.
+        postings.sort_by_key(|rows| rows.len());
+        let Some((seed, rest)) = postings.split_first() else {
+            // No word was typed in full: seed from prefix expansion instead.
+            return self.prefix_seed_rows(terms);
+        };
+
+        let mut survivors = seed.to_vec();
+        for rows in rest {
+            // Nothing left to narrow down; further intersections cannot add rows back.
+            if survivors.is_empty() {
+                break;
+            }
+            survivors = intersect_sorted(&survivors, rows);
+        }
+        survivors
+    }
+
+    /// Picks seed rows from whichever term has the smallest prefix expansion.
+    ///
+    /// For each term of at least `ADDRESS_SEARCH_PREFIX_MIN_LEN` bytes, the tokens listed under
+    /// its prefix key that start with the term have their posting lists merged via
+    /// `union_sorted`. The shortest non-empty merged list wins (the earliest term on ties); an
+    /// empty vector is returned when no term expands to any rows. `address_candidate_rows` calls
+    /// this only when no word matched an indexed token exactly, i.e. while the final word is
+    /// still being typed.
+    fn prefix_seed_rows(&self, terms: &[String]) -> Vec<u32> {
+        let mut smallest: Option<Vec<u32>> = None;
+        let long_enough = terms
+            .iter()
+            .filter(|term| term.len() >= ADDRESS_SEARCH_PREFIX_MIN_LEN);
+        for term in long_enough {
+            let Some(tokens) = self.address_prefix_index.get(address_prefix_key(term)) else {
+                continue;
+            };
+
+            // Merge the posting lists of every indexed token that extends this term.
+            let mut merged: Vec<u32> = Vec::new();
+            let expansions = tokens
+                .iter()
+                .filter(|token| token.starts_with(term.as_str()))
+                .filter_map(|token| self.address_token_index.get(token));
+            for rows in expansions {
+                merged = if merged.is_empty() {
+                    rows.clone()
+                } else {
+                    union_sorted(&merged, rows)
+                };
+            }
+            if merged.is_empty() {
+                continue;
+            }
+
+            // Keep this expansion only when it is strictly smaller than the best so far.
+            let shrinks = match &smallest {
+                Some(current) => merged.len() < current.len(),
+                None => true,
+            };
+            if shrinks {
+                smallest = Some(merged);
+            }
+        }
+        smallest.unwrap_or_default()
    }

    fn address_match_score(&self, row: usize, parsed: &AddressQuery) -> Option<i32> {
@ -1037,6 +1286,12 @@ impl PropertyData {
        if numeric_matches == parsed.numeric_terms.len() && numeric_matches > 0 {
            score += 50;
        }
+        // Additive bias (never a filter) when the row sits in the appended partial postcode.
+        if let Some(area) = parsed.postcode_area.as_deref() {
+            if self.row_postcode_in_area(row, area) {
+                score += 400;
+            }
+        }

        Some(score)
    }
@ -1969,16 +2224,23 @@ impl PropertyData {
                }
            }
        }
-        let address_token_count_before_prune = address_token_index.len();
-        address_token_index.retain(|_, rows| rows.len() <= ADDRESS_SEARCH_MAX_POSTINGS_PER_TOKEN);
+        // Keep every distinctive token: common road words ("high", "church", "station") are
+        // exactly what people search, and dropping them made those roads unsearchable while a
+        // prefix fallback surfaced the wrong street ("Highbury" for "High"). The candidate scan
+        // is bounded per query instead (ADDRESS_SEARCH_CANDIDATE_LIMIT), and stop words are
+        // already excluded from the index, so the largest posting lists stay modest.
+        let max_postings = address_token_index
+            .values()
+            .map(Vec::len)
+            .max()
+            .unwrap_or(0);
        let address_prefix_index = build_address_prefix_index(&address_token_index);
        let address_search_interner = address_search_rodeo.into_reader();
        let address_postings_count: usize = address_token_index.values().map(Vec::len).sum();
        tracing::info!(
            tokens = address_token_index.len(),
            prefixes = address_prefix_index.len(),
-            pruned_tokens =
-                address_token_count_before_prune.saturating_sub(address_token_index.len()),
+            max_postings_per_token = max_postings,
            postings = address_postings_count,
            row_tokens = address_search_token_keys.len(),
            "Address search index built"
@ -2340,6 +2602,79 @@ mod tests {
        assert_eq!(parsed.candidate_terms, vec!["downing".to_string()]);
    }

+    #[test]
+    fn address_query_recovers_appended_partial_postcode_as_bias() {
+        let query = parse_address_query("Baker Street NW1");
+
+        // A bare outcode is not a complete postcode; it is kept as the area hint instead.
+        assert!(query.full_postcode.is_none());
+        assert_eq!(query.postcode_area.as_deref(), Some("NW1"));
+
+        // The postcode fragment did not consume the road word and adds no house number.
+        assert!(query.numeric_terms.is_empty());
+        assert_eq!(query.candidate_terms, vec![String::from("baker")]);
+    }
+
+    #[test]
+    fn address_query_recovers_outcode_plus_sector_without_a_phantom_house_number() {
+        let query = parse_address_query("High Street CR0 2");
+
+        // Outcode and sector digit are joined into one compact area.
+        let area = query.postcode_area.as_deref();
+        assert_eq!(area, Some("CR02"));
+
+        // The trailing "2" is the postcode sector, so it must not surface as a house number.
+        assert!(query.numeric_terms.is_empty());
+        assert_eq!(query.candidate_terms, vec![String::from("high")]);
+    }
+
+    #[test]
+    fn full_postcode_takes_precedence_over_partial_bias() {
+        let query = parse_address_query("Baker Street NW1 6XE");
+
+        // A complete postcode is reported as such and leaves no partial-area hint behind.
+        assert!(query.postcode_area.is_none());
+        assert_eq!(query.full_postcode.as_deref(), Some("NW1 6XE"));
+    }
+
+    #[test]
+    fn intersect_and_union_sorted_row_ids() {
+        // Intersection keeps only the ids present in both inputs.
+        let common = intersect_sorted(&[1, 2, 3, 5], &[2, 3, 4, 5]);
+        assert_eq!(common, vec![2, 3, 5]);
+        let disjoint = intersect_sorted(&[1, 2], &[3, 4]);
+        assert_eq!(disjoint, Vec::<u32>::new());
+
+        // Union merges both inputs and lists a shared id once.
+        let merged = union_sorted(&[1, 3, 5], &[2, 3, 4]);
+        assert_eq!(merged, vec![1, 2, 3, 4, 5]);
+        let from_empty = union_sorted(&[], &[2, 4]);
+        assert_eq!(from_empty, vec![2, 4]);
+    }
+
+    #[test]
+    fn street_key_collapses_house_numbers_and_flats() {
+        // Each address must reduce to its bare, lowercased street name.
+        let cases = [
+            ("12 Baker Street", "baker street"),
+            ("5 Baker Street", "baker street"),
+            ("Flat 2, 10 Downing Street", "downing street"),
+            ("221B Baker Street", "baker street"),
+        ];
+        for (address, expected) in cases {
+            assert_eq!(street_key(address), expected);
+        }
+    }
+
+    #[test]
+    fn street_key_keeps_ordinal_street_names() {
+        // An ordinal belongs to the street name; it is not a house-number prefix to strip.
+        let cases = [("2nd Avenue", "2nd avenue"), ("12 3rd Avenue", "3rd avenue")];
+        for (address, expected) in cases {
+            assert_eq!(street_key(address), expected);
+        }
+
+        // Only a number with an ordinal suffix counts as an ordinal token.
+        assert!(is_ordinal_token("21st"));
+        for plain in ["21", "221b"] {
+            assert!(!is_ordinal_token(plain));
+        }
+    }
+
+    #[test]
+    fn postcode_area_recovered_only_from_the_trailing_position() {
+        // "A4" leads the query as a road designation, so it must not become an area refinement.
+        let leading = parse_address_query("A4 Great West Road");
+        assert!(leading.postcode_area.is_none());
+
+        // An outcode-shaped token at the end of the query is a genuine area refinement.
+        let trailing = parse_address_query("Great West Road W4");
+        assert_eq!(trailing.postcode_area.as_deref(), Some("W4"));
+    }
+
+    #[test]
+    fn road_type_detection() {
+        // Queries that contain a road-type word.
+        for query in ["high street", "acacia avenue"] {
+            assert!(query_has_road_type(query));
+        }
+        // Queries made only of a name or a place, with no road-type word.
+        for query in ["acacia", "london"] {
+            assert!(!query_has_road_type(query));
+        }
+    }
+
    #[test]
    fn address_query_parsing_keeps_partial_terms_for_row_matching() {
        let parsed = parse_address_query("settlers cour");