540 lines
16 KiB
Rust
540 lines
16 KiB
Rust
use std::path::Path;
|
||
|
||
use anyhow::Context;
|
||
use polars::frame::DataFrame;
|
||
use polars::lazy::frame::LazyFrame;
|
||
use polars::prelude::*;
|
||
use tracing::info;
|
||
|
||
use crate::utils::InternedColumn;
|
||
|
||
pub struct PlaceData {
|
||
pub name: Vec<String>,
|
||
pub name_lower: Vec<String>,
|
||
pub name_search: Vec<String>,
|
||
pub place_type: InternedColumn,
|
||
pub type_rank: Vec<u8>,
|
||
pub population: Vec<u32>,
|
||
pub lat: Vec<f32>,
|
||
pub lon: Vec<f32>,
|
||
pub city: Vec<Option<String>>,
|
||
pub travel_destination: Vec<bool>,
|
||
}
|
||
|
||
#[derive(Clone, Copy)]
|
||
pub(super) struct CityCandidate<'a> {
|
||
name: &'a str,
|
||
lat: f32,
|
||
lon: f32,
|
||
population: u32,
|
||
max_dist_sq: f32,
|
||
}
|
||
|
||
/// Squared distance limit for an ordinary city to label a place, in the
/// squared-degree units returned by `CityCandidate::distance_sq` (0.81 = 0.9 squared).
const PARENT_CITY_MAX_DIST_SQ: f32 = 0.81;

/// Display radius, in degrees, for the candidate named "London".
/// NOTE(review): presumably 30 km at roughly 111 km per degree — confirm.
const LONDON_DISPLAY_MAX_DEGREES: f32 = 30.0 / 111.0;

/// Square of `LONDON_DISPLAY_MAX_DEGREES`, comparable with squared distances.
const LONDON_DISPLAY_MAX_DIST_SQ: f32 = LONDON_DISPLAY_MAX_DEGREES * LONDON_DISPLAY_MAX_DEGREES;

/// Radius, in degrees, within which a much larger city subsumes a smaller one.
/// NOTE(review): presumably 5 km at roughly 111 km per degree — confirm.
const SUBSUMED_CITY_MAX_DEGREES: f32 = 5.0 / 111.0;

/// Square of `SUBSUMED_CITY_MAX_DEGREES`, comparable with squared distances.
const SUBSUMED_CITY_MAX_DIST_SQ: f32 = SUBSUMED_CITY_MAX_DEGREES * SUBSUMED_CITY_MAX_DEGREES;

/// A neighbour must have at least this many times a city's population to subsume it.
const SUBSUMED_CITY_MIN_POPULATION_RATIO: u32 = 10;
|
||
|
||
/// Returns the sort rank of a place type; lower ranks are more prominent.
///
/// Types that are not listed in the table (including differently-cased
/// spellings) receive the lowest priority, `6`.
fn type_rank(place_type: &str) -> u8 {
    const RANKED_KINDS: [(&[&str], u8); 6] = [
        (&["city"], 0),
        (&["town"], 1),
        (&["village"], 2),
        (
            &["suburb", "neighbourhood", "quarter", "borough", "locality"],
            3,
        ),
        (&["station", "university"], 4),
        (&["hamlet", "isolated_dwelling", "island"], 5),
    ];

    RANKED_KINDS
        .iter()
        .find(|(kinds, _)| kinds.contains(&place_type))
        .map_or(6, |&(_, rank)| rank)
}
|
||
|
||
/// Reports whether places of this type count as travel destinations.
///
/// Only the exact strings `"city"`, `"station"` and `"university"` qualify.
pub fn is_travel_destination_type(place_type: &str) -> bool {
    ["city", "station", "university"].contains(&place_type)
}
|
||
|
||
impl<'a> CityCandidate<'a> {
|
||
fn from_place(name: &'a str, lat: f32, lon: f32, population: u32) -> Self {
|
||
let max_dist_sq = if name == "London" {
|
||
LONDON_DISPLAY_MAX_DIST_SQ
|
||
} else {
|
||
PARENT_CITY_MAX_DIST_SQ
|
||
};
|
||
|
||
Self {
|
||
name,
|
||
lat,
|
||
lon,
|
||
population,
|
||
max_dist_sq,
|
||
}
|
||
}
|
||
|
||
fn distance_sq(&self, lat: f32, lon: f32, cos_lat: f32) -> f32 {
|
||
let dlat = self.lat - lat;
|
||
let dlon = (self.lon - lon) * cos_lat;
|
||
dlat * dlat + dlon * dlon
|
||
}
|
||
|
||
fn is_subsumed_by(&self, other: &Self) -> bool {
|
||
if self.population == 0 {
|
||
return false;
|
||
}
|
||
|
||
let min_parent_population =
|
||
u64::from(self.population) * u64::from(SUBSUMED_CITY_MIN_POPULATION_RATIO);
|
||
if u64::from(other.population) < min_parent_population {
|
||
return false;
|
||
}
|
||
|
||
other.distance_sq(self.lat, self.lon, self.lat.to_radians().cos())
|
||
< SUBSUMED_CITY_MAX_DIST_SQ
|
||
}
|
||
}
|
||
|
||
pub(super) fn display_city_candidates<'a>(
|
||
names: &'a [String],
|
||
type_rank: &[u8],
|
||
population: &[u32],
|
||
lat: &[f32],
|
||
lon: &[f32],
|
||
) -> Vec<CityCandidate<'a>> {
|
||
let cities: Vec<CityCandidate<'_>> = type_rank
|
||
.iter()
|
||
.enumerate()
|
||
.filter_map(|(idx, &rank)| {
|
||
if rank == 0 {
|
||
Some(CityCandidate::from_place(
|
||
&names[idx],
|
||
lat[idx],
|
||
lon[idx],
|
||
population[idx],
|
||
))
|
||
} else {
|
||
None
|
||
}
|
||
})
|
||
.collect();
|
||
|
||
cities
|
||
.iter()
|
||
.enumerate()
|
||
.filter_map(|(idx, city)| {
|
||
let is_subsumed = cities
|
||
.iter()
|
||
.enumerate()
|
||
.any(|(other_idx, other)| other_idx != idx && city.is_subsumed_by(other));
|
||
(!is_subsumed).then_some(*city)
|
||
})
|
||
.collect()
|
||
}
|
||
|
||
pub(super) fn nearest_display_city<'a>(
|
||
lat: f32,
|
||
lon: f32,
|
||
cities: &'a [CityCandidate<'a>],
|
||
) -> Option<&'a str> {
|
||
let cos_lat = lat.to_radians().cos();
|
||
let (best_city, best_dist_sq) = cities
|
||
.iter()
|
||
.map(|city| (city, city.distance_sq(lat, lon, cos_lat)))
|
||
.min_by(|(_, lhs), (_, rhs)| lhs.total_cmp(rhs))?;
|
||
|
||
(best_dist_sq < best_city.max_dist_sq).then_some(best_city.name)
|
||
}
|
||
|
||
/// Maps a lowercase Latin letter with a diacritic, or a Latin ligature, to its
/// plain ASCII spelling. Returns `None` for every other character.
fn fold_latin_letter(ch: char) -> Option<&'static str> {
    let ascii = match ch {
        // à á â ã ä å ā
        '\u{e0}'..='\u{e5}' | '\u{101}' => "a",
        // æ
        '\u{e6}' => "ae",
        // ç č
        '\u{e7}' | '\u{10d}' => "c",
        // ð
        '\u{f0}' => "d",
        // è é ê ë ē
        '\u{e8}'..='\u{eb}' | '\u{113}' => "e",
        // ì í î ï ī ı
        '\u{ec}'..='\u{ef}' | '\u{12b}' | '\u{131}' => "i",
        // ł
        '\u{142}' => "l",
        // ñ
        '\u{f1}' => "n",
        // ò ó ô õ ö ø ō ő
        '\u{f2}'..='\u{f6}' | '\u{f8}' | '\u{14d}' | '\u{151}' => "o",
        // œ
        '\u{153}' => "oe",
        // ř
        '\u{159}' => "r",
        // š
        '\u{161}' => "s",
        // ß
        '\u{df}' => "ss",
        // þ
        '\u{fe}' => "th",
        // ù ú û ü ū ű
        '\u{f9}'..='\u{fc}' | '\u{16b}' | '\u{171}' => "u",
        // ŵ
        '\u{175}' => "w",
        // ý ÿ ŷ
        '\u{fd}' | '\u{ff}' | '\u{177}' => "y",
        // ž
        '\u{17e}' => "z",
        _ => return None,
    };
    Some(ascii)
}

/// Normalises free text into the lowercase ASCII form used for search matching.
///
/// * Apostrophes (`'`, U+2019, and the backtick) and combining diacritical
///   marks are removed without splitting the word, so "King's" becomes "kings".
/// * ASCII letters and digits are lowercased and kept.
/// * Accented Latin letters and ligatures are folded to ASCII ("Môn" becomes
///   "mon"), so a name can be found by its plain spelling.
/// * Every other run of characters collapses to a single space; leading and
///   trailing separators are dropped.
///
/// Pure-ASCII input produces the same output as before diacritic folding was
/// added.
pub fn normalize_search_text(text: &str) -> String {
    let mut result = String::with_capacity(text.len());
    // Starts as `true` so leading separators never emit a space.
    let mut last_was_space = true;

    for ch in text.chars() {
        // U+2019 is the typographic apostrophe; U+0300..=U+036F are combining
        // marks left behind by decomposed accented letters.
        if matches!(ch, '\'' | '\u{2019}' | '`' | '\u{300}'..='\u{36f}') {
            continue;
        }

        let mut wrote_letter = false;
        for lower in ch.to_lowercase() {
            if lower.is_ascii_alphanumeric() {
                result.push(lower);
                wrote_letter = true;
            } else if let Some(ascii) = fold_latin_letter(lower) {
                result.push_str(ascii);
                wrote_letter = true;
            }
        }

        if wrote_letter {
            last_was_space = false;
        } else if !last_was_space {
            result.push(' ');
            last_was_space = true;
        }
    }

    if result.ends_with(' ') {
        result.pop();
    }
    result
}
|
||
|
||
/// Replaces every whitespace-delimited token equal to `from` with `to`.
///
/// Returns `None` when no token matches. Otherwise returns the tokens
/// re-joined with single spaces, which also collapses repeated whitespace.
fn replace_token(text: &str, from: &str, to: &str) -> Option<String> {
    if !text.split_whitespace().any(|word| word == from) {
        return None;
    }

    let swapped: Vec<&str> = text
        .split_whitespace()
        .map(|word| if word == from { to } else { word })
        .collect();
    Some(swapped.join(" "))
}
|
||
|
||
/// Appends `alias` to `aliases` unless it is empty or already present.
fn push_alias(aliases: &mut Vec<String>, alias: String) {
    if alias.is_empty() || aliases.contains(&alias) {
        return;
    }
    aliases.push(alias);
}
|
||
|
||
fn build_search_text(name: &str, place_type: &str) -> String {
|
||
let primary = normalize_search_text(name);
|
||
let mut aliases = vec![primary.clone()];
|
||
|
||
if let Some(alias) = replace_token(&primary, "st", "saint") {
|
||
push_alias(&mut aliases, alias);
|
||
}
|
||
if let Some(alias) = replace_token(&primary, "saint", "st") {
|
||
push_alias(&mut aliases, alias);
|
||
}
|
||
|
||
if place_type == "station" {
|
||
let suffix_aliases: [(&str, &[&str]); 6] = [
|
||
(
|
||
" tube station",
|
||
&[" underground station", " station", " tube", " underground"],
|
||
),
|
||
(
|
||
" underground station",
|
||
&[" tube station", " station", " tube", " underground"],
|
||
),
|
||
(
|
||
" railway station",
|
||
&[" rail station", " station", " railway", " rail"],
|
||
),
|
||
(
|
||
" overground station",
|
||
&[" station", " overground", " railway station"],
|
||
),
|
||
(
|
||
" elizabeth line station",
|
||
&[" station", " elizabeth line", " crossrail station"],
|
||
),
|
||
(" dlr station", &[" station", " dlr"]),
|
||
];
|
||
|
||
for (suffix, replacements) in suffix_aliases {
|
||
if let Some(stem) = primary.strip_suffix(suffix) {
|
||
for replacement in replacements {
|
||
push_alias(&mut aliases, format!("{stem}{replacement}"));
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
aliases.join(" | ")
|
||
}
|
||
|
||
/// Reads the string column `name` into owned `String`s.
///
/// # Errors
///
/// Fails when the column is missing, is not a string column, or contains a
/// null; the null error names the offending row.
fn extract_str_col(df: &DataFrame, name: &str) -> anyhow::Result<Vec<String>> {
    let string_column = df
        .column(name)
        .with_context(|| format!("Missing column '{name}' in places data"))?
        .str()
        .with_context(|| format!("Column '{name}' is not a string column"))?;

    let mut values = Vec::with_capacity(df.height());
    for (row, cell) in string_column.into_iter().enumerate() {
        let text = cell.with_context(|| format!("Column '{name}' has null at row {row}"))?;
        values.push(text.to_string());
    }
    Ok(values)
}
|
||
|
||
fn extract_f32_col(df: &DataFrame, name: &str) -> anyhow::Result<Vec<f32>> {
|
||
let column = df
|
||
.column(name)
|
||
.with_context(|| format!("Missing column '{name}' in places data"))?;
|
||
let cast = column
|
||
.cast(&DataType::Float32)
|
||
.with_context(|| format!("Failed to cast column '{name}' to Float32"))?;
|
||
let float_column = cast
|
||
.f32()
|
||
.with_context(|| format!("Column '{name}' is not a float32 column"))?;
|
||
float_column
|
||
.into_iter()
|
||
.enumerate()
|
||
.map(|(row, value)| value.with_context(|| format!("Column '{name}' has null at row {row}")))
|
||
.collect()
|
||
}
|
||
|
||
/// Reads the boolean column `name`.
///
/// # Errors
///
/// Fails when the column is missing, is not a boolean column, or contains a
/// null; the null error names the offending row.
fn extract_bool_col(df: &DataFrame, name: &str) -> anyhow::Result<Vec<bool>> {
    let bool_column = df
        .column(name)
        .with_context(|| format!("Missing column '{name}' in places data"))?
        .bool()
        .with_context(|| format!("Column '{name}' is not a boolean column"))?;

    let mut values = Vec::with_capacity(df.height());
    for (row, cell) in bool_column.into_iter().enumerate() {
        values.push(cell.with_context(|| format!("Column '{name}' has null at row {row}"))?);
    }
    Ok(values)
}
|
||
|
||
fn extract_optional_str_col(
|
||
df: &DataFrame,
|
||
name: &str,
|
||
) -> anyhow::Result<Option<Vec<Option<String>>>> {
|
||
let column = match df.column(name) {
|
||
Ok(column) => column,
|
||
Err(_) => return Ok(None),
|
||
};
|
||
let string_column = column
|
||
.str()
|
||
.with_context(|| format!("Column '{name}' is not a string column"))?;
|
||
Ok(Some(
|
||
string_column
|
||
.into_iter()
|
||
.map(|value| value.map(ToString::to_string))
|
||
.collect(),
|
||
))
|
||
}
|
||
|
||
impl PlaceData {
|
||
pub fn load(parquet_path: &Path) -> anyhow::Result<Self> {
|
||
super::run_polars_io(|| Self::load_inner(parquet_path))
|
||
}
|
||
|
||
fn load_inner(parquet_path: &Path) -> anyhow::Result<Self> {
|
||
info!("Loading place data from {:?}...", parquet_path);
|
||
|
||
let parquet_path = PlRefPath::try_from_path(parquet_path)
|
||
.context("Failed to normalize places parquet path")?;
|
||
let df = LazyFrame::scan_parquet(parquet_path, Default::default())
|
||
.context("Failed to scan places parquet")?
|
||
.collect()
|
||
.context("Failed to read places parquet")?;
|
||
|
||
let row_count = df.height();
|
||
info!("Loaded {} places", row_count);
|
||
|
||
let name = extract_str_col(&df, "name")?;
|
||
let place_type_raw = extract_str_col(&df, "place_type")?;
|
||
let lat = extract_f32_col(&df, "lat")?;
|
||
let lon = extract_f32_col(&df, "lon")?;
|
||
let population: Vec<u32> = if df.column("population").is_ok() {
|
||
let pop_f32 = extract_f32_col(&df, "population")?;
|
||
pop_f32
|
||
.iter()
|
||
.map(|&val| val.max(0.0).min(u32::MAX as f32) as u32)
|
||
.collect()
|
||
} else {
|
||
vec![0; row_count]
|
||
};
|
||
|
||
let name_lower: Vec<String> = name.iter().map(|nm| nm.to_lowercase()).collect();
|
||
let name_search: Vec<String> = name
|
||
.iter()
|
||
.zip(&place_type_raw)
|
||
.map(|(nm, pt)| build_search_text(nm, pt))
|
||
.collect();
|
||
let type_rank_vec: Vec<u8> = place_type_raw.iter().map(|pt| type_rank(pt)).collect();
|
||
let place_type = InternedColumn::build(&place_type_raw);
|
||
let travel_destination = if df.column("travel_destination").is_ok() {
|
||
extract_bool_col(&df, "travel_destination")?
|
||
} else {
|
||
place_type_raw
|
||
.iter()
|
||
.map(|place_type| is_travel_destination_type(place_type))
|
||
.collect()
|
||
};
|
||
let display_city_override = extract_optional_str_col(&df, "display_city")?;
|
||
|
||
// Precompute nearest city for each non-city place
|
||
let city_candidates =
|
||
display_city_candidates(&name, &type_rank_vec, &population, &lat, &lon);
|
||
|
||
let fallback_city: Vec<Option<String>> = (0..row_count)
|
||
.map(|idx| {
|
||
if type_rank_vec[idx] == 0 {
|
||
return None; // Cities don't need a city label
|
||
}
|
||
nearest_display_city(lat[idx], lon[idx], &city_candidates).map(str::to_string)
|
||
})
|
||
.collect();
|
||
|
||
let city: Vec<Option<String>> = if let Some(display_city_override) = display_city_override {
|
||
fallback_city
|
||
.into_iter()
|
||
.zip(display_city_override)
|
||
.enumerate()
|
||
.map(|(idx, (fallback, override_city))| {
|
||
if type_rank_vec[idx] == 0 {
|
||
return None;
|
||
}
|
||
override_city
|
||
.and_then(|value| {
|
||
let trimmed = value.trim();
|
||
(!trimmed.is_empty()).then(|| trimmed.to_string())
|
||
})
|
||
.or(fallback)
|
||
})
|
||
.collect()
|
||
} else {
|
||
fallback_city
|
||
};
|
||
|
||
let with_pop = population.iter().filter(|&&pop| pop > 0).count();
|
||
let with_city = city.iter().filter(|c| c.is_some()).count();
|
||
info!(
|
||
places = row_count,
|
||
types = place_type.values.len(),
|
||
with_population = with_pop,
|
||
with_city = with_city,
|
||
"Place data loaded"
|
||
);
|
||
|
||
Ok(PlaceData {
|
||
name,
|
||
name_lower,
|
||
name_search,
|
||
place_type,
|
||
type_rank: type_rank_vec,
|
||
population,
|
||
lat,
|
||
lon,
|
||
city,
|
||
travel_destination,
|
||
})
|
||
}
|
||
}
|
||
|
||
#[cfg(test)]
mod tests {
    use super::*;

    /// Fixture rows as `(name, lat, lon, population)`.
    fn test_city_rows() -> [(&'static str, f32, f32, u32); 5] {
        [
            ("London", 51.507_446, -0.1277653, 8_908_083),
            ("Westminster", 51.497_322, -0.137149, 211_365),
            ("City of London", 51.515_617, -0.0919983, 10_847),
            ("Cambridge", 52.205_532, 0.1186637, 145_818),
            ("Oxford", 51.752_014, -1.2578499, 165_000),
        ]
    }

    /// One candidate per fixture row, before any subsumption filtering.
    fn all_test_city_candidates() -> Vec<CityCandidate<'static>> {
        let mut candidates = Vec::new();
        for (name, lat, lon, population) in test_city_rows() {
            candidates.push(CityCandidate::from_place(name, lat, lon, population));
        }
        candidates
    }

    /// Fixture candidates with subsumed cities removed.
    fn test_city_candidates() -> Vec<CityCandidate<'static>> {
        let everyone = all_test_city_candidates();

        let mut survivors = Vec::new();
        for (position, candidate) in everyone.iter().enumerate() {
            let swallowed = everyone
                .iter()
                .enumerate()
                .any(|(rival_position, rival)| {
                    rival_position != position && candidate.is_subsumed_by(rival)
                });
            if !swallowed {
                survivors.push(*candidate);
            }
        }
        survivors
    }

    #[test]
    fn type_rank_ordering() {
        let [city, town, station, unknown] =
            ["city", "town", "station", "unknown"].map(type_rank);

        assert!(city < town);
        assert!(town < station);
        assert!(station < unknown);
    }

    #[test]
    fn search_text_handles_common_address_variants() {
        let cases = [
            (
                "King's Cross tube station",
                "station",
                "kings cross underground",
            ),
            ("St Albans", "city", "saint albans"),
            ("Shadwell DLR station", "station", "shadwell station"),
        ];

        for (name, place_type, expected_alias) in cases {
            let search_text = build_search_text(name, place_type);
            assert!(search_text.contains(expected_alias));
        }
    }

    #[test]
    fn travel_destination_types_match_legacy_places() {
        for accepted in ["city", "station"] {
            assert!(is_travel_destination_type(accepted));
        }
        for rejected in ["town", "suburb"] {
            assert!(!is_travel_destination_type(rejected));
        }
    }

    #[test]
    fn display_city_candidates_drop_city_nodes_subsumed_by_much_larger_nearby_city() {
        let mut names: Vec<String> = Vec::new();
        let mut population: Vec<u32> = Vec::new();
        let mut lat: Vec<f32> = Vec::new();
        let mut lon: Vec<f32> = Vec::new();
        for (row_name, row_lat, row_lon, row_population) in test_city_rows() {
            names.push(row_name.to_string());
            population.push(row_population);
            lat.push(row_lat);
            lon.push(row_lon);
        }
        // Every fixture row is a city.
        let type_rank: Vec<u8> = vec![0; names.len()];

        let cities = display_city_candidates(&names, &type_rank, &population, &lat, &lon);
        let kept_names: Vec<&str> = cities.iter().map(|city| city.name).collect();

        assert_eq!(kept_names, ["London", "Cambridge", "Oxford"]);
    }

    #[test]
    fn nearest_display_city_labels_inner_greater_london_from_london_candidate() {
        let cities = test_city_candidates();

        let label = nearest_display_city(51.371_304, -0.101957, &cities);

        assert_eq!(label, Some("London"));
    }

    #[test]
    fn nearest_display_city_preserves_non_london_duplicates() {
        let cities = test_city_candidates();

        let label = nearest_display_city(52.127_77, -0.0813098, &cities);

        assert_eq!(label, Some("Cambridge"));
    }

    #[test]
    fn nearest_display_city_does_not_extend_london_past_its_display_radius() {
        let cities = test_city_candidates();

        let label = nearest_display_city(51.5093, -0.5954, &cities);

        assert_eq!(label, None);
    }

    #[test]
    fn nearest_display_city_keeps_normal_non_london_city() {
        let cities = test_city_candidates();

        let label = nearest_display_city(51.456659, -0.969651, &cities);

        assert_eq!(label, Some("Oxford"));
    }
}
|