seems fine

This commit is contained in:
Andras Schmelczer 2026-05-05 22:29:28 +01:00
parent 48983e3b4b
commit 7a1696541f
37 changed files with 4999 additions and 1242 deletions

View file

@ -11,22 +11,127 @@ use crate::utils::InternedColumn;
/// Column-oriented table of places: each field holds one value per place.
///
/// NOTE(review): the loader builds the derived columns row-by-row from the
/// same source frame, so all vectors are presumably the same length and share
/// indices — confirm against the full `load` implementation.
pub struct PlaceData {
/// Place names as read from the source data.
pub name: Vec<String>,
/// `name` converted with `to_lowercase`.
pub name_lower: Vec<String>,
/// Search text produced by `build_search_text` from the name and place
/// type: the normalized name plus generated aliases, joined with `" | "`.
pub name_search: Vec<String>,
/// Place-type strings stored through `InternedColumn::build`.
pub place_type: InternedColumn,
/// Result of `type_rank` applied to each raw place-type string.
pub type_rank: Vec<u8>,
/// Population per place (extraction code is not visible in this view).
pub population: Vec<u32>,
/// Latitude per place — presumably decimal degrees; TODO confirm.
pub lat: Vec<f32>,
/// Longitude per place — presumably decimal degrees; TODO confirm.
pub lon: Vec<f32>,
/// Nearest city, precomputed by the loader for non-city places.
/// NOTE(review): exact `None` semantics are not visible here — confirm.
pub city: Vec<Option<String>>,
/// Travel-destination flag. Read from the `travel_destination` column when
/// the source has one (null cells become `true`); otherwise derived from
/// the place type via `is_travel_destination_type`.
pub travel_destination: Vec<bool>,
}
/// Maps a raw place-type string to a numeric rank, where a smaller number
/// means a larger / more significant kind of place.
///
/// Ranking: `city` (0) < `town` (1) < `village` (2) < sub-city areas (3)
/// < `station` (4) < tiny settlements and islands (5) < anything else (6).
///
/// The earlier two-level ranking (`"station" => 1`, `_ => 2`) has been
/// removed: its catch-all arm preceded the newer arms and made every one of
/// them unreachable, so e.g. `town` ranked *after* `station`.
///
/// NOTE(review): the loader derives its city indices from the rank vector, so
/// keep `city` at rank 0 unless that code is updated too — confirm.
fn type_rank(place_type: &str) -> u8 {
    match place_type {
        "city" => 0,
        "town" => 1,
        "village" => 2,
        "suburb" | "neighbourhood" | "quarter" | "borough" | "locality" => 3,
        "station" => 4,
        "hamlet" | "isolated_dwelling" | "island" => 5,
        // Unknown or unlisted types sort last.
        _ => 6,
    }
}
/// Returns `true` when `place_type` is one of the types treated as a travel
/// destination by default: exactly `"city"` or `"station"`.
pub fn is_travel_destination_type(place_type: &str) -> bool {
    const DESTINATION_TYPES: [&str; 2] = ["city", "station"];
    DESTINATION_TYPES.contains(&place_type)
}
/// Normalizes free text into a lowercase, space-separated ASCII search key.
///
/// * Apostrophe-like marks (`'`, the typographic `’` U+2019, and `` ` ``) are
///   removed without leaving a gap, so `King's` becomes `kings`.
/// * ASCII letters and digits are kept, lowercased.
/// * Every other character (punctuation, whitespace, non-ASCII) acts as a
///   separator; runs of separators collapse into a single space.
/// * The result has no leading or trailing space.
pub fn normalize_search_text(text: &str) -> String {
    let mut result = String::with_capacity(text.len());
    // Starts as `true` so that leading separators never emit a space.
    let mut last_was_space = true;
    for ch in text.chars() {
        // The typographic apostrophe is written as an escape so the literal
        // cannot be lost to an encoding round-trip (an empty `''` literal does
        // not compile).
        if matches!(ch, '\'' | '\u{2019}' | '`') {
            continue;
        }
        let lower = ch.to_ascii_lowercase();
        if lower.is_ascii_alphanumeric() {
            result.push(lower);
            last_was_space = false;
        } else if !last_was_space {
            result.push(' ');
            last_was_space = true;
        }
    }
    // At most one trailing separator can have been emitted.
    if result.ends_with(' ') {
        result.pop();
    }
    result
}
/// Replaces every whitespace-delimited token of `text` that equals `from`
/// with `to`.
///
/// Returns `None` when no token matched. Otherwise returns the tokens joined
/// by single spaces, so any original run of whitespace is collapsed. Only
/// whole tokens are compared; substrings inside a longer token are untouched.
fn replace_token(text: &str, from: &str, to: &str) -> Option<String> {
    let mut hit = false;
    let mut rebuilt = String::with_capacity(text.len());
    for (position, word) in text.split_whitespace().enumerate() {
        if position > 0 {
            rebuilt.push(' ');
        }
        if word == from {
            hit = true;
            rebuilt.push_str(to);
        } else {
            rebuilt.push_str(word);
        }
    }
    if hit {
        Some(rebuilt)
    } else {
        None
    }
}
/// Appends `alias` to `aliases` unless it is empty or already present,
/// preserving insertion order.
fn push_alias(aliases: &mut Vec<String>, alias: String) {
    if alias.is_empty() || aliases.contains(&alias) {
        return;
    }
    aliases.push(alias);
}
/// Builds the search string for one place: the normalized name followed by
/// generated aliases, de-duplicated and joined with `" | "`.
///
/// Aliases, in order:
/// 1. the name with the token `st` replaced by `saint`;
/// 2. the name with the token `saint` replaced by `st`;
/// 3. for `place_type == "station"` only, the name with a recognized station
///    suffix swapped for each of its alternative endings.
///
/// Station suffixes are matched against the normalized name only, not against
/// the `st`/`saint` aliases.
fn build_search_text(name: &str, place_type: &str) -> String {
    // (token to find, token to substitute), applied in this order.
    const SAINT_SWAPS: [(&str, &str); 2] = [("st", "saint"), ("saint", "st")];

    // (suffix to strip, endings appended to the remaining stem).
    const STATION_SUFFIX_ALIASES: [(&str, &[&str]); 5] = [
        (
            " tube station",
            &[" underground station", " station", " tube", " underground"],
        ),
        (
            " underground station",
            &[" tube station", " station", " tube", " underground"],
        ),
        (
            " railway station",
            &[" rail station", " station", " railway", " rail"],
        ),
        (
            " overground station",
            &[" station", " overground", " railway station"],
        ),
        (
            " elizabeth line station",
            &[" station", " elizabeth line", " crossrail station"],
        ),
    ];

    let primary = normalize_search_text(name);
    let mut aliases = vec![primary.clone()];

    for (from, to) in SAINT_SWAPS {
        if let Some(swapped) = replace_token(&primary, from, to) {
            push_alias(&mut aliases, swapped);
        }
    }

    if place_type == "station" {
        let matching = STATION_SUFFIX_ALIASES
            .iter()
            .filter_map(|&(suffix, endings)| {
                primary.strip_suffix(suffix).map(|stem| (stem, endings))
            });
        for (stem, endings) in matching {
            for ending in endings {
                push_alias(&mut aliases, format!("{stem}{ending}"));
            }
        }
    }

    aliases.join(" | ")
}
fn extract_str_col(df: &DataFrame, name: &str) -> anyhow::Result<Vec<String>> {
let column = df
.column(name)
@ -56,6 +161,23 @@ fn extract_f32_col(df: &DataFrame, name: &str) -> anyhow::Result<Vec<f32>> {
.collect())
}
/// Reads column `name` from `df` as a vector of booleans.
///
/// * If the column lookup fails, returns `default_value` repeated once per
///   row of `df`.
/// * If the column exists but is not boolean, returns an error naming the
///   column.
/// * Null cells are replaced by `default_value`.
fn extract_bool_col_or_default(
    df: &DataFrame,
    name: &str,
    default_value: bool,
) -> anyhow::Result<Vec<bool>> {
    match df.column(name) {
        Err(_) => Ok(vec![default_value; df.height()]),
        Ok(column) => {
            let flags = column
                .bool()
                .with_context(|| format!("Column '{name}' is not a boolean column"))?;
            let values = flags
                .into_iter()
                .map(|cell| match cell {
                    Some(flag) => flag,
                    None => default_value,
                })
                .collect();
            Ok(values)
        }
    }
}
impl PlaceData {
pub fn load(parquet_path: &Path) -> anyhow::Result<Self> {
info!("Loading place data from {:?}...", parquet_path);
@ -80,8 +202,21 @@ impl PlaceData {
};
let name_lower: Vec<String> = name.iter().map(|nm| nm.to_lowercase()).collect();
let name_search: Vec<String> = name
.iter()
.zip(&place_type_raw)
.map(|(nm, pt)| build_search_text(nm, pt))
.collect();
let type_rank_vec: Vec<u8> = place_type_raw.iter().map(|pt| type_rank(pt)).collect();
let place_type = InternedColumn::build(&place_type_raw);
let travel_destination = if df.column("travel_destination").is_ok() {
extract_bool_col_or_default(&df, "travel_destination", true)?
} else {
place_type_raw
.iter()
.map(|place_type| is_travel_destination_type(place_type))
.collect()
};
// Precompute nearest city for each non-city place
let city_indices: Vec<usize> = type_rank_vec
@ -133,12 +268,14 @@ impl PlaceData {
Ok(PlaceData {
name,
name_lower,
name_search,
place_type,
type_rank: type_rank_vec,
population,
lat,
lon,
city,
travel_destination,
})
}
}
@ -149,7 +286,23 @@ mod tests {
/// Checks the relative order of ranks for a few representative place types.
#[test]
fn type_rank_ordering() {
    // Each pair is (type expected to get the smaller rank, type expected to
    // get the larger rank).
    let expected_order = [
        ("city", "station"),
        ("city", "town"),
        ("town", "station"),
        ("station", "unknown"),
    ];
    for (smaller, larger) in expected_order {
        assert!(type_rank(smaller) < type_rank(larger));
    }
}
/// Checks that generated search text contains the expected aliases for a
/// tube station name and for an abbreviated "St" place name.
#[test]
fn search_text_handles_common_address_variants() {
    let station_text = build_search_text("King's Cross tube station", "station");
    assert!(station_text.contains("kings cross underground"));

    let city_text = build_search_text("St Albans", "city");
    assert!(city_text.contains("saint albans"));
}
/// Checks which place types count as travel destinations by default.
#[test]
fn travel_destination_types_match_legacy_places() {
    for accepted in ["city", "station"] {
        assert!(is_travel_destination_type(accepted));
    }
    for rejected in ["town", "suburb"] {
        assert!(!is_travel_destination_type(rejected));
    }
}
}