perfect-postcode/server-rs/src/data/places.rs
2026-05-17 13:52:11 +01:00

540 lines
16 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

use std::path::Path;
use anyhow::Context;
use polars::frame::DataFrame;
use polars::lazy::frame::LazyFrame;
use polars::prelude::*;
use tracing::info;
use crate::utils::InternedColumn;
/// Column-oriented place table loaded from the places parquet file.
///
/// Every vector holds one entry per row of the source frame, so index `i`
/// refers to the same place across all fields.
pub struct PlaceData {
/// Values of the `name` column.
pub name: Vec<String>,
/// `name` converted with `str::to_lowercase`.
pub name_lower: Vec<String>,
/// Search text built from the name and place type: the normalized name
/// followed by its aliases, joined with `" | "`.
pub name_search: Vec<String>,
/// Interned form of the `place_type` column.
pub place_type: InternedColumn,
/// Rank derived from the place type (0 for `"city"`, 6 for unrecognised types).
pub type_rank: Vec<u8>,
/// `population` column read as f32 and clamped into the `u32` range;
/// all zeros when the column is absent.
pub population: Vec<u32>,
/// `lat` column cast to `f32` (presumably degrees — TODO confirm against the data).
pub lat: Vec<f32>,
/// `lon` column cast to `f32` (presumably degrees — TODO confirm against the data).
pub lon: Vec<f32>,
/// Display city label. Always `None` for rank-0 (city) rows; otherwise the
/// trimmed, non-empty `display_city` override when that column exists,
/// falling back to the nearest display city within range.
pub city: Vec<Option<String>>,
/// `travel_destination` column when present, otherwise derived from the
/// place type via `is_travel_destination_type`.
pub travel_destination: Vec<bool>,
}
/// A city that may be used as the display label for nearby places.
///
/// Borrows its name from the caller's name storage, so it is cheap to copy.
#[derive(Clone, Copy)]
pub(super) struct CityCandidate<'a> {
/// Name of the city, borrowed from the place name list.
name: &'a str,
/// Latitude of the city.
lat: f32,
/// Longitude of the city.
lon: f32,
/// Population of the city; 0 means the city can never be treated as subsumed.
population: u32,
/// Squared distance below which a place may be labelled with this city.
max_dist_sq: f32,
}
/// Squared labelling radius given to every city candidate except "London".
/// 0.81 is 0.9 squared, in the same units as the lat/lon differences
/// (presumably degrees — TODO confirm).
const PARENT_CITY_MAX_DIST_SQ: f32 = 0.81;
/// Labelling radius for the "London" candidate.
/// NOTE(review): reads as 30 km at roughly 111 km per degree — confirm.
const LONDON_DISPLAY_MAX_DEGREES: f32 = 30.0 / 111.0;
/// Square of `LONDON_DISPLAY_MAX_DEGREES`, compared against squared distances.
const LONDON_DISPLAY_MAX_DIST_SQ: f32 = LONDON_DISPLAY_MAX_DEGREES * LONDON_DISPLAY_MAX_DEGREES;
/// Radius within which a much larger city can subsume a smaller city.
/// NOTE(review): reads as 5 km at roughly 111 km per degree — confirm.
const SUBSUMED_CITY_MAX_DEGREES: f32 = 5.0 / 111.0;
/// Square of `SUBSUMED_CITY_MAX_DEGREES`, compared against squared distances.
const SUBSUMED_CITY_MAX_DIST_SQ: f32 = SUBSUMED_CITY_MAX_DEGREES * SUBSUMED_CITY_MAX_DEGREES;
/// A city is only subsumed by another whose population is at least this many
/// times its own.
const SUBSUMED_CITY_MIN_POPULATION_RATIO: u32 = 10;
/// Maps a place type to its ordering rank; lower ranks are more prominent.
///
/// Cities rank 0, and any type not listed in the table ranks 6.
fn type_rank(place_type: &str) -> u8 {
    // Each entry pairs a group of place types with the rank they share.
    const RANKED_TYPES: [(&[&str], u8); 6] = [
        (&["city"], 0),
        (&["town"], 1),
        (&["village"], 2),
        (
            &["suburb", "neighbourhood", "quarter", "borough", "locality"],
            3,
        ),
        (&["station", "university"], 4),
        (&["hamlet", "isolated_dwelling", "island"], 5),
    ];
    // Rank assigned to every place type missing from the table.
    const UNRANKED: u8 = 6;

    RANKED_TYPES
        .iter()
        .find(|(group, _)| group.iter().any(|&known| known == place_type))
        .map_or(UNRANKED, |&(_, rank)| rank)
}
/// Returns `true` when `place_type` is one of the types treated as a travel
/// destination: `"city"`, `"station"` or `"university"`.
pub fn is_travel_destination_type(place_type: &str) -> bool {
    ["city", "station", "university"]
        .iter()
        .any(|&destination| destination == place_type)
}
impl<'a> CityCandidate<'a> {
    /// Builds a candidate, choosing its labelling radius from the city name:
    /// "London" gets the London-specific radius, every other city the default.
    fn from_place(name: &'a str, lat: f32, lon: f32, population: u32) -> Self {
        let max_dist_sq = match name {
            "London" => LONDON_DISPLAY_MAX_DIST_SQ,
            _ => PARENT_CITY_MAX_DIST_SQ,
        };
        Self {
            name,
            lat,
            lon,
            population,
            max_dist_sq,
        }
    }

    /// Squared distance from this city to (`lat`, `lon`).
    ///
    /// The longitude difference is scaled by `cos_lat`, which the caller
    /// supplies so it can be computed once per query point.
    fn distance_sq(&self, lat: f32, lon: f32, cos_lat: f32) -> f32 {
        let north_south = self.lat - lat;
        let east_west = (self.lon - lon) * cos_lat;
        north_south * north_south + east_west * east_west
    }

    /// Whether `other` is both much more populous than this city and close
    /// enough to absorb it.
    ///
    /// A city with zero population is never subsumed.
    fn is_subsumed_by(&self, other: &Self) -> bool {
        let own_population = u64::from(self.population);
        // Widened to u64 so the multiplication cannot overflow.
        let required_population =
            own_population * u64::from(SUBSUMED_CITY_MIN_POPULATION_RATIO);

        own_population != 0
            && u64::from(other.population) >= required_population
            && other.distance_sq(self.lat, self.lon, self.lat.to_radians().cos())
                < SUBSUMED_CITY_MAX_DIST_SQ
    }
}
/// Collects the cities that may be used as display labels.
///
/// Every row whose rank is 0 becomes a candidate; candidates that are
/// subsumed by another candidate (see `CityCandidate::is_subsumed_by`) are
/// then removed. The surviving candidates keep their original row order.
///
/// The slices are indexed by row, so each must be at least as long as
/// `type_rank` for the rows that are cities.
pub(super) fn display_city_candidates<'a>(
    names: &'a [String],
    type_rank: &[u8],
    population: &[u32],
    lat: &[f32],
    lon: &[f32],
) -> Vec<CityCandidate<'a>> {
    // Pass 1: gather every rank-0 row as a candidate.
    let mut all_cities: Vec<CityCandidate<'a>> = Vec::new();
    for (row, &rank) in type_rank.iter().enumerate() {
        if rank != 0 {
            continue;
        }
        all_cities.push(CityCandidate::from_place(
            &names[row],
            lat[row],
            lon[row],
            population[row],
        ));
    }

    // Pass 2: keep only candidates that no other candidate subsumes.
    let mut kept: Vec<CityCandidate<'a>> = Vec::new();
    for (position, candidate) in all_cities.iter().enumerate() {
        let absorbed = all_cities
            .iter()
            .enumerate()
            .any(|(rival_position, rival)| {
                rival_position != position && candidate.is_subsumed_by(rival)
            });
        if !absorbed {
            kept.push(*candidate);
        }
    }
    kept
}
/// Finds the display city label for the point (`lat`, `lon`).
///
/// Picks the candidate with the smallest squared distance (the earliest one
/// wins ties) and returns its name only when that distance is strictly below
/// the candidate's own `max_dist_sq`. Returns `None` when `cities` is empty
/// or the closest city is out of range.
pub(super) fn nearest_display_city<'a>(
    lat: f32,
    lon: f32,
    cities: &'a [CityCandidate<'a>],
) -> Option<&'a str> {
    let cos_lat = lat.to_radians().cos();

    let mut remaining = cities.iter();
    let mut closest = remaining.next()?;
    let mut closest_dist_sq = closest.distance_sq(lat, lon, cos_lat);

    for candidate in remaining {
        let dist_sq = candidate.distance_sq(lat, lon, cos_lat);
        // Strictly-less keeps the earliest candidate on ties.
        if dist_sq.total_cmp(&closest_dist_sq).is_lt() {
            closest = candidate;
            closest_dist_sq = dist_sq;
        }
    }

    if closest_dist_sq < closest.max_dist_sq {
        Some(closest.name)
    } else {
        None
    }
}
/// Normalizes free text into the canonical form used for place search.
///
/// * Apostrophe-like characters are removed outright so that "King's" and
///   "Kings" normalize identically: the ASCII apostrophe, the typographic
///   right single quotation mark (U+2019) and the backtick.
/// * ASCII letters are lower-cased and ASCII alphanumerics are kept.
/// * Every run of other characters collapses into a single space, with no
///   leading or trailing space in the result.
pub fn normalize_search_text(text: &str) -> String {
    let mut result = String::with_capacity(text.len());
    // Starts as `true` so leading separators never emit a space.
    let mut last_was_space = true;
    for ch in text.chars() {
        // The typographic apostrophe is written as an escape so it cannot be
        // lost or confused with the ASCII apostrophe in the source text.
        if matches!(ch, '\'' | '\u{2019}' | '`') {
            continue;
        }
        let lower = ch.to_ascii_lowercase();
        if lower.is_ascii_alphanumeric() {
            result.push(lower);
            last_was_space = false;
        } else if !last_was_space {
            result.push(' ');
            last_was_space = true;
        }
    }
    // At most one trailing space can exist because runs are collapsed above.
    if result.ends_with(' ') {
        result.pop();
    }
    result
}
/// Replaces every whitespace-separated token equal to `from` with `to`.
///
/// Returns the tokens re-joined with single spaces, or `None` when no token
/// matched.
fn replace_token(text: &str, from: &str, to: &str) -> Option<String> {
    let mut pieces: Vec<&str> = Vec::new();
    let mut any_replaced = false;

    for word in text.split_whitespace() {
        if word == from {
            any_replaced = true;
            pieces.push(to);
        } else {
            pieces.push(word);
        }
    }

    if any_replaced {
        Some(pieces.join(" "))
    } else {
        None
    }
}
/// Appends `alias` to `aliases` unless it is empty or already present.
fn push_alias(aliases: &mut Vec<String>, alias: String) {
    if alias.is_empty() || aliases.contains(&alias) {
        return;
    }
    aliases.push(alias);
}
/// Builds the searchable text for a place: the normalized name followed by
/// its aliases, joined with `" | "`.
///
/// Aliases are added in this order, skipping duplicates:
/// 1. the name with each `st` token expanded to `saint`,
/// 2. the name with each `saint` token shortened to `st`,
/// 3. for `"station"` places only, variants of a recognised station suffix.
fn build_search_text(name: &str, place_type: &str) -> String {
    // Station suffixes paired with the alternative endings offered for them.
    const STATION_SUFFIX_ALIASES: [(&str, &[&str]); 6] = [
        (
            " tube station",
            &[" underground station", " station", " tube", " underground"],
        ),
        (
            " underground station",
            &[" tube station", " station", " tube", " underground"],
        ),
        (
            " railway station",
            &[" rail station", " station", " railway", " rail"],
        ),
        (
            " overground station",
            &[" station", " overground", " railway station"],
        ),
        (
            " elizabeth line station",
            &[" station", " elizabeth line", " crossrail station"],
        ),
        (" dlr station", &[" station", " dlr"]),
    ];

    let primary = normalize_search_text(name);
    let mut aliases = vec![primary.clone()];

    for (from, to) in [("st", "saint"), ("saint", "st")] {
        if let Some(variant) = replace_token(&primary, from, to) {
            push_alias(&mut aliases, variant);
        }
    }

    if place_type == "station" {
        for (suffix, endings) in STATION_SUFFIX_ALIASES {
            let Some(stem) = primary.strip_suffix(suffix) else {
                continue;
            };
            for ending in endings {
                push_alias(&mut aliases, format!("{stem}{ending}"));
            }
        }
    }

    aliases.join(" | ")
}
/// Reads the string column `name` from `df` as owned strings.
///
/// # Errors
///
/// Fails when the column is missing, is not a string column, or contains a
/// null; the null error reports the offending row index.
fn extract_str_col(df: &DataFrame, name: &str) -> anyhow::Result<Vec<String>> {
    let strings = df
        .column(name)
        .with_context(|| format!("Missing column '{name}' in places data"))?
        .str()
        .with_context(|| format!("Column '{name}' is not a string column"))?;

    let mut values = Vec::with_capacity(df.height());
    for (row, cell) in strings.into_iter().enumerate() {
        let text = cell.with_context(|| format!("Column '{name}' has null at row {row}"))?;
        values.push(text.to_string());
    }
    Ok(values)
}
/// Reads the column `name` from `df` as `f32` values, casting it to Float32
/// first.
///
/// # Errors
///
/// Fails when the column is missing, cannot be cast to Float32, or contains a
/// null; the null error reports the offending row index.
fn extract_f32_col(df: &DataFrame, name: &str) -> anyhow::Result<Vec<f32>> {
    let as_float32 = df
        .column(name)
        .with_context(|| format!("Missing column '{name}' in places data"))?
        .cast(&DataType::Float32)
        .with_context(|| format!("Failed to cast column '{name}' to Float32"))?;
    let floats = as_float32
        .f32()
        .with_context(|| format!("Column '{name}' is not a float32 column"))?;

    let mut values = Vec::with_capacity(df.height());
    for (row, cell) in floats.into_iter().enumerate() {
        values.push(cell.with_context(|| format!("Column '{name}' has null at row {row}"))?);
    }
    Ok(values)
}
/// Reads the boolean column `name` from `df`.
///
/// # Errors
///
/// Fails when the column is missing, is not a boolean column, or contains a
/// null; the null error reports the offending row index.
fn extract_bool_col(df: &DataFrame, name: &str) -> anyhow::Result<Vec<bool>> {
    let flags = df
        .column(name)
        .with_context(|| format!("Missing column '{name}' in places data"))?
        .bool()
        .with_context(|| format!("Column '{name}' is not a boolean column"))?;

    let mut values = Vec::with_capacity(df.height());
    for (row, cell) in flags.into_iter().enumerate() {
        values.push(cell.with_context(|| format!("Column '{name}' has null at row {row}"))?);
    }
    Ok(values)
}
/// Reads the string column `name` from `df` if it exists, keeping nulls as
/// `None`.
///
/// Returns `Ok(None)` when the column lookup fails, and `Ok(Some(values))`
/// with one entry per row otherwise.
///
/// # Errors
///
/// Fails when the column exists but is not a string column.
fn extract_optional_str_col(
    df: &DataFrame,
    name: &str,
) -> anyhow::Result<Option<Vec<Option<String>>>> {
    let Ok(column) = df.column(name) else {
        return Ok(None);
    };
    let strings = column
        .str()
        .with_context(|| format!("Column '{name}' is not a string column"))?;

    let mut values: Vec<Option<String>> = Vec::with_capacity(df.height());
    for cell in strings.into_iter() {
        values.push(cell.map(ToString::to_string));
    }
    Ok(Some(values))
}
impl PlaceData {
    /// Loads the place table from the parquet file at `parquet_path`.
    ///
    /// The work is done by `load_inner`, invoked through
    /// `super::run_polars_io`.
    ///
    /// # Errors
    ///
    /// Propagates any error from reading the file or from a required column
    /// being missing, mistyped or containing nulls.
    pub fn load(parquet_path: &Path) -> anyhow::Result<Self> {
        super::run_polars_io(|| Self::load_inner(parquet_path))
    }

    /// Reads the parquet file and derives every column of `PlaceData`.
    ///
    /// `name`, `place_type`, `lat` and `lon` are required; `population`,
    /// `travel_destination` and `display_city` are optional and have
    /// fallbacks.
    fn load_inner(parquet_path: &Path) -> anyhow::Result<Self> {
        info!("Loading place data from {:?}...", parquet_path);
        let scan_path = PlRefPath::try_from_path(parquet_path)
            .context("Failed to normalize places parquet path")?;
        let lazy_frame = LazyFrame::scan_parquet(scan_path, Default::default())
            .context("Failed to scan places parquet")?;
        let df = lazy_frame
            .collect()
            .context("Failed to read places parquet")?;
        let row_count = df.height();
        info!("Loaded {} places", row_count);

        // Required columns.
        let name = extract_str_col(&df, "name")?;
        let place_type_raw = extract_str_col(&df, "place_type")?;
        let lat = extract_f32_col(&df, "lat")?;
        let lon = extract_f32_col(&df, "lon")?;

        // Population is optional; missing means zero for every row.
        let population: Vec<u32> = match df.column("population") {
            Ok(_) => extract_f32_col(&df, "population")?
                .into_iter()
                .map(|raw| raw.max(0.0).min(u32::MAX as f32) as u32)
                .collect(),
            Err(_) => vec![0; row_count],
        };

        let name_lower: Vec<String> = name.iter().map(|place| place.to_lowercase()).collect();
        let name_search: Vec<String> = name
            .iter()
            .zip(&place_type_raw)
            .map(|(place, kind)| build_search_text(place, kind))
            .collect();
        let type_rank_vec: Vec<u8> = place_type_raw.iter().map(|kind| type_rank(kind)).collect();
        let place_type = InternedColumn::build(&place_type_raw);

        // Prefer the explicit flag column; otherwise derive it from the type.
        let travel_destination: Vec<bool> = match df.column("travel_destination") {
            Ok(_) => extract_bool_col(&df, "travel_destination")?,
            Err(_) => place_type_raw
                .iter()
                .map(|kind| is_travel_destination_type(kind))
                .collect(),
        };
        let display_city_override = extract_optional_str_col(&df, "display_city")?;

        // Precompute the nearest display city for each non-city place.
        let city_candidates =
            display_city_candidates(&name, &type_rank_vec, &population, &lat, &lon);
        let fallback_city: Vec<Option<String>> = type_rank_vec
            .iter()
            .zip(lat.iter().zip(&lon))
            .map(|(&rank, (&place_lat, &place_lon))| {
                if rank == 0 {
                    // Cities don't need a city label.
                    None
                } else {
                    nearest_display_city(place_lat, place_lon, &city_candidates)
                        .map(str::to_string)
                }
            })
            .collect();

        // A non-blank override wins over the computed fallback, except for
        // cities, which never carry a label.
        let city: Vec<Option<String>> = match display_city_override {
            None => fallback_city,
            Some(overrides) => fallback_city
                .into_iter()
                .zip(overrides)
                .zip(&type_rank_vec)
                .map(|((fallback, override_city), &rank)| {
                    if rank == 0 {
                        return None;
                    }
                    override_city
                        .as_deref()
                        .map(str::trim)
                        .filter(|label| !label.is_empty())
                        .map(str::to_string)
                        .or(fallback)
                })
                .collect(),
        };

        let populated_rows = population.iter().filter(|&&count| count != 0).count();
        let labelled_rows = city.iter().flatten().count();
        info!(
            places = row_count,
            types = place_type.values.len(),
            with_population = populated_rows,
            with_city = labelled_rows,
            "Place data loaded"
        );

        Ok(Self {
            name,
            name_lower,
            name_search,
            place_type,
            type_rank: type_rank_vec,
            population,
            lat,
            lon,
            city,
            travel_destination,
        })
    }
}
// Unit tests for place ranking, search-text aliases and display-city selection.
#[cfg(test)]
mod tests {
use super::*;
/// Fixture rows as `(name, lat, lon, population)`: London, two small cities
/// close to it, and two independent cities.
fn test_city_rows() -> [(&'static str, f32, f32, u32); 5] {
[
("London", 51.507_446, -0.1277653, 8_908_083),
("Westminster", 51.497_322, -0.137149, 211_365),
("City of London", 51.515_617, -0.0919983, 10_847),
("Cambridge", 52.205_532, 0.1186637, 145_818),
("Oxford", 51.752_014, -1.2578499, 165_000),
]
}
/// Every fixture row turned into a candidate, with no subsumption filtering.
fn all_test_city_candidates() -> Vec<CityCandidate<'static>> {
test_city_rows()
.into_iter()
.map(|(name, lat, lon, population)| {
CityCandidate::from_place(name, lat, lon, population)
})
.collect()
}
/// Fixture candidates with subsumed cities removed, using the same filter
/// expression as `display_city_candidates`.
fn test_city_candidates() -> Vec<CityCandidate<'static>> {
let cities = all_test_city_candidates();
cities
.iter()
.enumerate()
.filter_map(|(idx, city)| {
let is_subsumed = cities
.iter()
.enumerate()
.any(|(other_idx, other)| other_idx != idx && city.is_subsumed_by(other));
(!is_subsumed).then_some(*city)
})
.collect()
}
// Ranks must order city < town < station < unrecognised type.
#[test]
fn type_rank_ordering() {
assert!(type_rank("city") < type_rank("town"));
assert!(type_rank("town") < type_rank("station"));
assert!(type_rank("station") < type_rank("unknown"));
}
// Apostrophes are dropped, `st` expands to `saint`, and station suffixes
// produce shorter aliases.
#[test]
fn search_text_handles_common_address_variants() {
assert!(build_search_text("King's Cross tube station", "station")
.contains("kings cross underground"));
assert!(build_search_text("St Albans", "city").contains("saint albans"));
assert!(build_search_text("Shadwell DLR station", "station").contains("shadwell station"));
}
// Cities and stations are travel destinations; towns and suburbs are not.
#[test]
fn travel_destination_types_match_legacy_places() {
assert!(is_travel_destination_type("city"));
assert!(is_travel_destination_type("station"));
assert!(!is_travel_destination_type("town"));
assert!(!is_travel_destination_type("suburb"));
}
// With every fixture row ranked as a city, only London, Cambridge and Oxford
// are expected to survive the subsumption filter.
#[test]
fn display_city_candidates_drop_city_nodes_subsumed_by_much_larger_nearby_city() {
let rows = test_city_rows();
let names: Vec<String> = rows
.iter()
.map(|(name, _, _, _)| name.to_string())
.collect();
let type_rank: Vec<u8> = vec![0; rows.len()];
let population: Vec<u32> = rows
.iter()
.map(|(_, _, _, population)| *population)
.collect();
let lat: Vec<f32> = rows.iter().map(|(_, lat, _, _)| *lat).collect();
let lon: Vec<f32> = rows.iter().map(|(_, _, lon, _)| *lon).collect();
let cities = display_city_candidates(&names, &type_rank, &population, &lat, &lon);
assert_eq!(
cities.iter().map(|city| city.name).collect::<Vec<_>>(),
["London", "Cambridge", "Oxford"]
);
}
// A point inside London's display radius is labelled "London".
#[test]
fn nearest_display_city_labels_inner_greater_london_from_london_candidate() {
let cities = test_city_candidates();
assert_eq!(
nearest_display_city(51.371_304, -0.101957, &cities),
Some("London")
);
}
// A point nearest to Cambridge is labelled "Cambridge".
#[test]
fn nearest_display_city_preserves_non_london_duplicates() {
let cities = test_city_candidates();
assert_eq!(
nearest_display_city(52.127_77, -0.0813098, &cities),
Some("Cambridge")
);
}
// When London is the nearest city but the point lies outside London's display
// radius, no label is returned.
#[test]
fn nearest_display_city_does_not_extend_london_past_its_display_radius() {
let cities = test_city_candidates();
assert_eq!(nearest_display_city(51.5093, -0.5954, &cities), None);
}
// A point nearest to Oxford, within the default radius, is labelled "Oxford".
#[test]
fn nearest_display_city_keeps_normal_non_london_city() {
let cities = test_city_candidates();
assert_eq!(
nearest_display_city(51.456659, -0.969651, &cities),
Some("Oxford")
);
}
}