use std::path::Path; use anyhow::Context; use polars::frame::DataFrame; use polars::lazy::frame::LazyFrame; use polars::prelude::*; use tracing::info; use crate::utils::InternedColumn; pub struct PlaceData { pub name: Vec, pub name_lower: Vec, pub name_search: Vec, pub place_type: InternedColumn, pub type_rank: Vec, pub population: Vec, pub lat: Vec, pub lon: Vec, pub city: Vec>, pub travel_destination: Vec, } #[derive(Clone, Copy)] pub(super) struct CityCandidate<'a> { name: &'a str, lat: f32, lon: f32, population: u32, max_dist_sq: f32, } const PARENT_CITY_MAX_DIST_SQ: f32 = 0.81; const LONDON_DISPLAY_MAX_DEGREES: f32 = 30.0 / 111.0; const LONDON_DISPLAY_MAX_DIST_SQ: f32 = LONDON_DISPLAY_MAX_DEGREES * LONDON_DISPLAY_MAX_DEGREES; const SUBSUMED_CITY_MAX_DEGREES: f32 = 5.0 / 111.0; const SUBSUMED_CITY_MAX_DIST_SQ: f32 = SUBSUMED_CITY_MAX_DEGREES * SUBSUMED_CITY_MAX_DEGREES; const SUBSUMED_CITY_MIN_POPULATION_RATIO: u32 = 10; fn type_rank(place_type: &str) -> u8 { match place_type { "city" => 0, "town" => 1, "village" => 2, "suburb" | "neighbourhood" | "quarter" | "borough" | "locality" => 3, "station" | "university" => 4, "hamlet" | "isolated_dwelling" | "island" => 5, _ => 6, } } pub fn is_travel_destination_type(place_type: &str) -> bool { matches!(place_type, "city" | "station" | "university") } impl<'a> CityCandidate<'a> { fn from_place(name: &'a str, lat: f32, lon: f32, population: u32) -> Self { let max_dist_sq = if name == "London" { LONDON_DISPLAY_MAX_DIST_SQ } else { PARENT_CITY_MAX_DIST_SQ }; Self { name, lat, lon, population, max_dist_sq, } } fn distance_sq(&self, lat: f32, lon: f32, cos_lat: f32) -> f32 { let dlat = self.lat - lat; let dlon = (self.lon - lon) * cos_lat; dlat * dlat + dlon * dlon } fn is_subsumed_by(&self, other: &Self) -> bool { if self.population == 0 { return false; } let min_parent_population = u64::from(self.population) * u64::from(SUBSUMED_CITY_MIN_POPULATION_RATIO); if u64::from(other.population) < min_parent_population { return false; } other.distance_sq(self.lat, self.lon, self.lat.to_radians().cos()) < SUBSUMED_CITY_MAX_DIST_SQ } } pub(super) fn display_city_candidates<'a>( names: &'a [String], type_rank: &[u8], population: &[u32], lat: &[f32], lon: &[f32], ) -> Vec> { let cities: Vec> = type_rank .iter() .enumerate() .filter_map(|(idx, &rank)| { if rank == 0 { Some(CityCandidate::from_place( &names[idx], lat[idx], lon[idx], population[idx], )) } else { None } }) .collect(); cities .iter() .enumerate() .filter_map(|(idx, city)| { let is_subsumed = cities .iter() .enumerate() .any(|(other_idx, other)| other_idx != idx && city.is_subsumed_by(other)); (!is_subsumed).then_some(*city) }) .collect() } pub(super) fn nearest_display_city<'a>( lat: f32, lon: f32, cities: &'a [CityCandidate<'a>], ) -> Option<&'a str> { let cos_lat = lat.to_radians().cos(); let (best_city, best_dist_sq) = cities .iter() .map(|city| (city, city.distance_sq(lat, lon, cos_lat))) .min_by(|(_, lhs), (_, rhs)| lhs.total_cmp(rhs))?; (best_dist_sq < best_city.max_dist_sq).then_some(best_city.name) } pub fn normalize_search_text(text: &str) -> String { let mut result = String::with_capacity(text.len()); let mut last_was_space = true; for ch in text.chars() { if ch == '\'' || ch == '’' || ch == '`' { continue; } let lower = ch.to_ascii_lowercase(); if lower.is_ascii_alphanumeric() { result.push(lower); last_was_space = false; } else if !last_was_space { result.push(' '); last_was_space = true; } } if result.ends_with(' ') { result.pop(); } result } fn replace_token(text: &str, from: &str, to: &str) -> Option { let mut changed = false; let replaced: Vec<&str> = text .split_whitespace() .map(|token| { if token == from { changed = true; to } else { token } }) .collect(); changed.then(|| replaced.join(" ")) } fn push_alias(aliases: &mut Vec, alias: String) { if !alias.is_empty() && !aliases.iter().any(|existing| existing == &alias) { aliases.push(alias); } } fn build_search_text(name: &str, place_type: &str) -> String { let primary = normalize_search_text(name); let mut aliases = vec![primary.clone()]; if let Some(alias) = replace_token(&primary, "st", "saint") { push_alias(&mut aliases, alias); } if let Some(alias) = replace_token(&primary, "saint", "st") { push_alias(&mut aliases, alias); } if place_type == "station" { let suffix_aliases: [(&str, &[&str]); 6] = [ ( " tube station", &[" underground station", " station", " tube", " underground"], ), ( " underground station", &[" tube station", " station", " tube", " underground"], ), ( " railway station", &[" rail station", " station", " railway", " rail"], ), ( " overground station", &[" station", " overground", " railway station"], ), ( " elizabeth line station", &[" station", " elizabeth line", " crossrail station"], ), (" dlr station", &[" station", " dlr"]), ]; for (suffix, replacements) in suffix_aliases { if let Some(stem) = primary.strip_suffix(suffix) { for replacement in replacements { push_alias(&mut aliases, format!("{stem}{replacement}")); } } } } aliases.join(" | ") } fn extract_str_col(df: &DataFrame, name: &str) -> anyhow::Result> { let column = df .column(name) .with_context(|| format!("Missing column '{name}' in places data"))?; let string_column = column .str() .with_context(|| format!("Column '{name}' is not a string column"))?; string_column .into_iter() .enumerate() .map(|(row, value)| { value .map(ToString::to_string) .with_context(|| format!("Column '{name}' has null at row {row}")) }) .collect() } fn extract_f32_col(df: &DataFrame, name: &str) -> anyhow::Result> { let column = df .column(name) .with_context(|| format!("Missing column '{name}' in places data"))?; let cast = column .cast(&DataType::Float32) .with_context(|| format!("Failed to cast column '{name}' to Float32"))?; let float_column = cast .f32() .with_context(|| format!("Column '{name}' is not a float32 column"))?; float_column .into_iter() .enumerate() .map(|(row, value)| value.with_context(|| format!("Column '{name}' has null at row {row}"))) .collect() } fn extract_bool_col(df: &DataFrame, name: &str) -> anyhow::Result> { let column = df .column(name) .with_context(|| format!("Missing column '{name}' in places data"))?; let bool_column = column .bool() .with_context(|| format!("Column '{name}' is not a boolean column"))?; bool_column .into_iter() .enumerate() .map(|(row, value)| value.with_context(|| format!("Column '{name}' has null at row {row}"))) .collect() } fn extract_optional_str_col( df: &DataFrame, name: &str, ) -> anyhow::Result>>> { let column = match df.column(name) { Ok(column) => column, Err(_) => return Ok(None), }; let string_column = column .str() .with_context(|| format!("Column '{name}' is not a string column"))?; Ok(Some( string_column .into_iter() .map(|value| value.map(ToString::to_string)) .collect(), )) } impl PlaceData { pub fn load(parquet_path: &Path) -> anyhow::Result { super::run_polars_io(|| Self::load_inner(parquet_path)) } fn load_inner(parquet_path: &Path) -> anyhow::Result { info!("Loading place data from {:?}...", parquet_path); let parquet_path = PlRefPath::try_from_path(parquet_path) .context("Failed to normalize places parquet path")?; let df = LazyFrame::scan_parquet(parquet_path, Default::default()) .context("Failed to scan places parquet")? .collect() .context("Failed to read places parquet")?; let row_count = df.height(); info!("Loaded {} places", row_count); let name = extract_str_col(&df, "name")?; let place_type_raw = extract_str_col(&df, "place_type")?; let lat = extract_f32_col(&df, "lat")?; let lon = extract_f32_col(&df, "lon")?; let population: Vec = if df.column("population").is_ok() { let pop_f32 = extract_f32_col(&df, "population")?; pop_f32 .iter() .map(|&val| val.max(0.0).min(u32::MAX as f32) as u32) .collect() } else { vec![0; row_count] }; let name_lower: Vec = name.iter().map(|nm| nm.to_lowercase()).collect(); let name_search: Vec = name .iter() .zip(&place_type_raw) .map(|(nm, pt)| build_search_text(nm, pt)) .collect(); let type_rank_vec: Vec = place_type_raw.iter().map(|pt| type_rank(pt)).collect(); let place_type = InternedColumn::build(&place_type_raw); let travel_destination = if df.column("travel_destination").is_ok() { extract_bool_col(&df, "travel_destination")? } else { place_type_raw .iter() .map(|place_type| is_travel_destination_type(place_type)) .collect() }; let display_city_override = extract_optional_str_col(&df, "display_city")?; // Precompute nearest city for each non-city place let city_candidates = display_city_candidates(&name, &type_rank_vec, &population, &lat, &lon); let fallback_city: Vec> = (0..row_count) .map(|idx| { if type_rank_vec[idx] == 0 { return None; // Cities don't need a city label } nearest_display_city(lat[idx], lon[idx], &city_candidates).map(str::to_string) }) .collect(); let city: Vec> = if let Some(display_city_override) = display_city_override { fallback_city .into_iter() .zip(display_city_override) .enumerate() .map(|(idx, (fallback, override_city))| { if type_rank_vec[idx] == 0 { return None; } override_city .and_then(|value| { let trimmed = value.trim(); (!trimmed.is_empty()).then(|| trimmed.to_string()) }) .or(fallback) }) .collect() } else { fallback_city }; let with_pop = population.iter().filter(|&&pop| pop > 0).count(); let with_city = city.iter().filter(|c| c.is_some()).count(); info!( places = row_count, types = place_type.values.len(), with_population = with_pop, with_city = with_city, "Place data loaded" ); Ok(PlaceData { name, name_lower, name_search, place_type, type_rank: type_rank_vec, population, lat, lon, city, travel_destination, }) } } #[cfg(test)] mod tests { use super::*; fn test_city_rows() -> [(&'static str, f32, f32, u32); 5] { [ ("London", 51.507_446, -0.1277653, 8_908_083), ("Westminster", 51.497_322, -0.137149, 211_365), ("City of London", 51.515_617, -0.0919983, 10_847), ("Cambridge", 52.205_532, 0.1186637, 145_818), ("Oxford", 51.752_014, -1.2578499, 165_000), ] } fn all_test_city_candidates() -> Vec> { test_city_rows() .into_iter() .map(|(name, lat, lon, population)| { CityCandidate::from_place(name, lat, lon, population) }) .collect() } fn test_city_candidates() -> Vec> { let cities = all_test_city_candidates(); cities .iter() .enumerate() .filter_map(|(idx, city)| { let is_subsumed = cities .iter() .enumerate() .any(|(other_idx, other)| other_idx != idx && city.is_subsumed_by(other)); (!is_subsumed).then_some(*city) }) .collect() } #[test] fn type_rank_ordering() { assert!(type_rank("city") < type_rank("town")); assert!(type_rank("town") < type_rank("station")); assert!(type_rank("station") < type_rank("unknown")); } #[test] fn search_text_handles_common_address_variants() { assert!(build_search_text("King's Cross tube station", "station") .contains("kings cross underground")); assert!(build_search_text("St Albans", "city").contains("saint albans")); assert!(build_search_text("Shadwell DLR station", "station").contains("shadwell station")); } #[test] fn travel_destination_types_match_legacy_places() { assert!(is_travel_destination_type("city")); assert!(is_travel_destination_type("station")); assert!(!is_travel_destination_type("town")); assert!(!is_travel_destination_type("suburb")); } #[test] fn display_city_candidates_drop_city_nodes_subsumed_by_much_larger_nearby_city() { let rows = test_city_rows(); let names: Vec = rows .iter() .map(|(name, _, _, _)| name.to_string()) .collect(); let type_rank: Vec = vec![0; rows.len()]; let population: Vec = rows .iter() .map(|(_, _, _, population)| *population) .collect(); let lat: Vec = rows.iter().map(|(_, lat, _, _)| *lat).collect(); let lon: Vec = rows.iter().map(|(_, _, lon, _)| *lon).collect(); let cities = display_city_candidates(&names, &type_rank, &population, &lat, &lon); assert_eq!( cities.iter().map(|city| city.name).collect::>(), ["London", "Cambridge", "Oxford"] ); } #[test] fn nearest_display_city_labels_inner_greater_london_from_london_candidate() { let cities = test_city_candidates(); assert_eq!( nearest_display_city(51.371_304, -0.101957, &cities), Some("London") ); } #[test] fn nearest_display_city_preserves_non_london_duplicates() { let cities = test_city_candidates(); assert_eq!( nearest_display_city(52.127_77, -0.0813098, &cities), Some("Cambridge") ); } #[test] fn nearest_display_city_does_not_extend_london_past_its_display_radius() { let cities = test_city_candidates(); assert_eq!(nearest_display_city(51.5093, -0.5954, &cities), None); } #[test] fn nearest_display_city_keeps_normal_non_london_city() { let cities = test_city_candidates(); assert_eq!( nearest_display_city(51.456659, -0.969651, &cities), Some("Oxford") ); } }