seems fine

2026-05-05 22:29:28 +01:00 · 2026-05-05 22:29:28 +01:00 · 7a1696541f
commit 7a1696541f
parent 48983e3b4b
37 changed files with 4999 additions and 1242 deletions
--- a/server-rs/src/data/places.rs
+++ b/server-rs/src/data/places.rs
@@ -11,22 +11,127 @@ use crate::utils::InternedColumn;
 pub struct PlaceData {
    pub name: Vec<String>,
    pub name_lower: Vec<String>,
+    pub name_search: Vec<String>,
    pub place_type: InternedColumn,
    pub type_rank: Vec<u8>,
    pub population: Vec<u32>,
    pub lat: Vec<f32>,
    pub lon: Vec<f32>,
    pub city: Vec<Option<String>>,
+    pub travel_destination: Vec<bool>,
 }

 fn type_rank(place_type: &str) -> u8 {
    match place_type {
        "city" => 0,
-        "station" => 1,
-        _ => 2,
+        "town" => 1,
+        "village" => 2,
+        "suburb" | "neighbourhood" | "quarter" | "borough" | "locality" => 3,
+        "station" => 4,
+        "hamlet" | "isolated_dwelling" | "island" => 5,
+        _ => 6,
    }
 }

+pub fn is_travel_destination_type(place_type: &str) -> bool {
+    matches!(place_type, "city" | "station")
+}
+
+pub fn normalize_search_text(text: &str) -> String {
+    let mut result = String::with_capacity(text.len());
+    let mut last_was_space = true;
+
+    for ch in text.chars() {
+        if ch == '\'' || ch == '’' || ch == '`' {
+            continue;
+        }
+
+        let lower = ch.to_ascii_lowercase();
+        if lower.is_ascii_alphanumeric() {
+            result.push(lower);
+            last_was_space = false;
+        } else if !last_was_space {
+            result.push(' ');
+            last_was_space = true;
+        }
+    }
+
+    if result.ends_with(' ') {
+        result.pop();
+    }
+    result
+}
+
+fn replace_token(text: &str, from: &str, to: &str) -> Option<String> {
+    let mut changed = false;
+    let replaced: Vec<&str> = text
+        .split_whitespace()
+        .map(|token| {
+            if token == from {
+                changed = true;
+                to
+            } else {
+                token
+            }
+        })
+        .collect();
+
+    changed.then(|| replaced.join(" "))
+}
+
+fn push_alias(aliases: &mut Vec<String>, alias: String) {
+    if !alias.is_empty() && !aliases.iter().any(|existing| existing == &alias) {
+        aliases.push(alias);
+    }
+}
+
+fn build_search_text(name: &str, place_type: &str) -> String {
+    let primary = normalize_search_text(name);
+    let mut aliases = vec![primary.clone()];
+
+    if let Some(alias) = replace_token(&primary, "st", "saint") {
+        push_alias(&mut aliases, alias);
+    }
+    if let Some(alias) = replace_token(&primary, "saint", "st") {
+        push_alias(&mut aliases, alias);
+    }
+
+    if place_type == "station" {
+        let suffix_aliases: [(&str, &[&str]); 5] = [
+            (
+                " tube station",
+                &[" underground station", " station", " tube", " underground"],
+            ),
+            (
+                " underground station",
+                &[" tube station", " station", " tube", " underground"],
+            ),
+            (
+                " railway station",
+                &[" rail station", " station", " railway", " rail"],
+            ),
+            (
+                " overground station",
+                &[" station", " overground", " railway station"],
+            ),
+            (
+                " elizabeth line station",
+                &[" station", " elizabeth line", " crossrail station"],
+            ),
+        ];
+
+        for (suffix, replacements) in suffix_aliases {
+            if let Some(stem) = primary.strip_suffix(suffix) {
+                for replacement in replacements {
+                    push_alias(&mut aliases, format!("{stem}{replacement}"));
+                }
+            }
+        }
+    }
+
+    aliases.join(" | ")
+}
+
 fn extract_str_col(df: &DataFrame, name: &str) -> anyhow::Result<Vec<String>> {
    let column = df
        .column(name)
@@ -56,6 +161,23 @@ fn extract_f32_col(df: &DataFrame, name: &str) -> anyhow::Result<Vec<f32>> {
        .collect())
 }

+fn extract_bool_col_or_default(
+    df: &DataFrame,
+    name: &str,
+    default_value: bool,
+) -> anyhow::Result<Vec<bool>> {
+    let Ok(column) = df.column(name) else {
+        return Ok(vec![default_value; df.height()]);
+    };
+    let bool_column = column
+        .bool()
+        .with_context(|| format!("Column '{name}' is not a boolean column"))?;
+    Ok(bool_column
+        .into_iter()
+        .map(|value| value.unwrap_or(default_value))
+        .collect())
+}
+
 impl PlaceData {
    pub fn load(parquet_path: &Path) -> anyhow::Result<Self> {
        info!("Loading place data from {:?}...", parquet_path);
@@ -80,8 +202,21 @@ impl PlaceData {
        };

        let name_lower: Vec<String> = name.iter().map(|nm| nm.to_lowercase()).collect();
+        let name_search: Vec<String> = name
+            .iter()
+            .zip(&place_type_raw)
+            .map(|(nm, pt)| build_search_text(nm, pt))
+            .collect();
        let type_rank_vec: Vec<u8> = place_type_raw.iter().map(|pt| type_rank(pt)).collect();
        let place_type = InternedColumn::build(&place_type_raw);
+        let travel_destination = if df.column("travel_destination").is_ok() {
+            extract_bool_col_or_default(&df, "travel_destination", true)?
+        } else {
+            place_type_raw
+                .iter()
+                .map(|place_type| is_travel_destination_type(place_type))
+                .collect()
+        };

        // Precompute nearest city for each non-city place
        let city_indices: Vec<usize> = type_rank_vec
@@ -133,12 +268,14 @@ impl PlaceData {
        Ok(PlaceData {
            name,
            name_lower,
+            name_search,
            place_type,
            type_rank: type_rank_vec,
            population,
            lat,
            lon,
            city,
+            travel_destination,
        })
    }
 }
@@ -149,7 +286,23 @@ mod tests {

    #[test]
    fn type_rank_ordering() {
-        assert!(type_rank("city") < type_rank("station"));
+        assert!(type_rank("city") < type_rank("town"));
+        assert!(type_rank("town") < type_rank("station"));
        assert!(type_rank("station") < type_rank("unknown"));
    }
+
+    #[test]
+    fn search_text_handles_common_address_variants() {
+        assert!(build_search_text("King's Cross tube station", "station")
+            .contains("kings cross underground"));
+        assert!(build_search_text("St Albans", "city").contains("saint albans"));
+    }
+
+    #[test]
+    fn travel_destination_types_match_legacy_places() {
+        assert!(is_travel_destination_type("city"));
+        assert!(is_travel_destination_type("station"));
+        assert!(!is_travel_destination_type("town"));
+        assert!(!is_travel_destination_type("suburb"));
+    }
 }