This commit is contained in:
Andras Schmelczer 2026-05-17 10:16:30 +01:00
parent 47d89f6fad
commit 017902b8e6
82 changed files with 331466 additions and 54841 deletions

View file

@ -21,6 +21,16 @@ pub struct PlaceData {
pub travel_destination: Vec<bool>,
}
/// A city that a nearby place may be labelled with.
///
/// The name is borrowed for `'a`; coordinates are in degrees (they are fed to
/// `f32::to_radians` when distances are computed).
pub(super) struct CityCandidate<'a> {
/// City name; returned as the display label unless it is a London alias.
pub(super) name: &'a str,
/// Latitude of the city, in degrees.
pub(super) lat: f32,
/// Longitude of the city, in degrees.
pub(super) lon: f32,
}
/// Squared distance (in degrees²) at or beyond which the nearest city is too
/// far away to be used as a label. Roughly 100 km: 1° ≈ 111 km, so
/// 0.9° ≈ 100 km and 0.9² = 0.81.
const PARENT_CITY_MAX_DIST_SQ: f32 = 0.81;
/// 30 km expressed in degrees, using the approximation 1° ≈ 111 km.
const LONDON_DISPLAY_MAX_DEGREES: f32 = 30.0 / 111.0;
/// Squared form of `LONDON_DISPLAY_MAX_DEGREES`, so it can be compared
/// directly against squared distances without taking a square root.
const LONDON_DISPLAY_MAX_DIST_SQ: f32 = LONDON_DISPLAY_MAX_DEGREES * LONDON_DISPLAY_MAX_DEGREES;
fn type_rank(place_type: &str) -> u8 {
match place_type {
"city" => 0,
@ -37,6 +47,53 @@ pub fn is_travel_destination_type(place_type: &str) -> bool {
matches!(place_type, "city" | "station" | "university")
}
/// Squared planar distance, in degrees², between the point (`lat`, `lon`) and `city`.
///
/// The longitude difference is scaled by the cosine of the query point's
/// latitude so that east-west and north-south degrees are comparable. No
/// square root is taken; callers compare the result against squared thresholds.
fn distance_sq(lat: f32, lon: f32, city: &CityCandidate<'_>) -> f32 {
    let north_south = city.lat - lat;
    // Longitude degrees shrink with latitude; scale by cos(lat) of the query point.
    let east_west = (city.lon - lon) * lat.to_radians().cos();
    north_south * north_south + east_west * east_west
}
/// Returns `true` when `name` is exactly one of the city names treated as
/// London: `"London"`, `"Westminster"` or `"City of London"`.
///
/// The comparison is case-sensitive and does no trimming.
fn is_london_city_name(name: &str) -> bool {
    const LONDON_ALIASES: [&str; 3] = ["London", "Westminster", "City of London"];
    LONDON_ALIASES.contains(&name)
}
/// Picks the city label to display for a place at (`lat`, `lon`).
///
/// The nearest entry of `cities` (by `distance_sq`) is selected. The result is:
///
/// * `None` if `cities` is empty, or if the nearest city is at or beyond
///   `PARENT_CITY_MAX_DIST_SQ`;
/// * the nearest city's own name if that name is not a London alias
///   (see `is_london_city_name`);
/// * `Some("London")` if the nearest city is a London alias **and** a
///   candidate named exactly `"London"` lies within
///   `LONDON_DISPLAY_MAX_DIST_SQ`; otherwise `None`, so that alias names such
///   as "Westminster" are never shown on their own.
pub(super) fn nearest_display_city<'a>(
    lat: f32,
    lon: f32,
    cities: &'a [CityCandidate<'a>],
) -> Option<&'a str> {
    let mut best_dist_sq = f32::MAX;
    let mut best_city: Option<&CityCandidate<'_>> = None;
    let mut london_dist_sq: Option<f32> = None;

    for city in cities {
        let dist_sq = distance_sq(lat, lon, city);

        if city.name == "London" {
            // Track the *closest* "London" entry. Simply overwriting would let a
            // later, farther duplicate decide the guard below.
            london_dist_sq =
                Some(london_dist_sq.map_or(dist_sq, |closest| closest.min(dist_sq)));
        }

        // Strict `<` keeps the first candidate on ties.
        if dist_sq < best_dist_sq {
            best_dist_sq = dist_sq;
            best_city = Some(city);
        }
    }

    let best_city = best_city?;
    if best_dist_sq >= PARENT_CITY_MAX_DIST_SQ {
        return None;
    }

    if !is_london_city_name(best_city.name) {
        return Some(best_city.name);
    }

    // The nearest city is a London alias: canonicalise to "London", but only
    // when the London candidate itself is close enough.
    london_dist_sq
        .is_some_and(|dist_sq| dist_sq < LONDON_DISPLAY_MAX_DIST_SQ)
        .then_some("London")
}
pub fn normalize_search_text(text: &str) -> String {
let mut result = String::with_capacity(text.len());
let mut last_was_space = true;
@ -182,6 +239,25 @@ fn extract_bool_col(df: &DataFrame, name: &str) -> anyhow::Result<Vec<bool>> {
.collect()
}
/// Reads column `name` from `df` as owned, nullable strings, if it can be looked up.
///
/// Returns `Ok(None)` when `df.column(name)` fails (the column is treated as
/// absent), `Ok(Some(values))` with one entry per value of the column
/// otherwise (`None` entries for nulls).
///
/// # Errors
///
/// Fails when the column exists but cannot be viewed as a string column.
fn extract_optional_str_col(
    df: &DataFrame,
    name: &str,
) -> anyhow::Result<Option<Vec<Option<String>>>> {
    let Ok(column) = df.column(name) else {
        return Ok(None);
    };

    let values = column
        .str()
        .with_context(|| format!("Column '{name}' is not a string column"))?
        .into_iter()
        .map(|cell| cell.map(|text| text.to_string()))
        .collect();

    Ok(Some(values))
}
impl PlaceData {
pub fn load(parquet_path: &Path) -> anyhow::Result<Self> {
super::run_polars_io(|| Self::load_inner(parquet_path))
@ -227,6 +303,7 @@ impl PlaceData {
.map(|place_type| is_travel_destination_type(place_type))
.collect()
};
let display_city_override = extract_optional_str_col(&df, "display_city")?;
// Precompute nearest city for each non-city place
let city_indices: Vec<usize> = type_rank_vec
@ -234,37 +311,45 @@ impl PlaceData {
.enumerate()
.filter_map(|(idx, &rank)| if rank == 0 { Some(idx) } else { None })
.collect();
let city_candidates: Vec<CityCandidate<'_>> = city_indices
.iter()
.map(|&idx| CityCandidate {
name: &name[idx],
lat: lat[idx],
lon: lon[idx],
})
.collect();
let city: Vec<Option<String>> = (0..row_count)
let fallback_city: Vec<Option<String>> = (0..row_count)
.map(|idx| {
if type_rank_vec[idx] == 0 {
return None; // Cities don't need a city label
}
let plat = lat[idx];
let plon = lon[idx];
let cos_lat = (plat.to_radians()).cos();
let mut best_dist_sq = f32::MAX;
let mut best_city: Option<&str> = None;
for &ci in &city_indices {
let dlat = lat[ci] - plat;
let dlon = (lon[ci] - plon) * cos_lat;
let dist_sq = dlat * dlat + dlon * dlon;
if dist_sq < best_dist_sq {
best_dist_sq = dist_sq;
best_city = Some(&name[ci]);
}
}
// ~100km threshold: 1° ≈ 111km, so 0.9° ≈ 100km → 0.81 squared
if best_dist_sq < 0.81 {
best_city.map(|s| s.to_string())
} else {
None
}
nearest_display_city(lat[idx], lon[idx], &city_candidates).map(str::to_string)
})
.collect();
let city: Vec<Option<String>> = if let Some(display_city_override) = display_city_override {
fallback_city
.into_iter()
.zip(display_city_override)
.enumerate()
.map(|(idx, (fallback, override_city))| {
if type_rank_vec[idx] == 0 {
return None;
}
override_city
.and_then(|value| {
let trimmed = value.trim();
(!trimmed.is_empty()).then(|| trimmed.to_string())
})
.or(fallback)
})
.collect()
} else {
fallback_city
};
let with_pop = population.iter().filter(|&&pop| pop > 0).count();
let with_city = city.iter().filter(|c| c.is_some()).count();
info!(
@ -294,6 +379,36 @@ impl PlaceData {
mod tests {
use super::*;
/// Shared fixture: the three London aliases plus Cambridge and Oxford,
/// in that order.
fn test_city_candidates() -> Vec<CityCandidate<'static>> {
    // (name, latitude, longitude)
    const FIXTURE: [(&str, f32, f32); 5] = [
        ("London", 51.5074456, -0.1277653),
        ("Westminster", 51.4973206, -0.137149),
        ("City of London", 51.5156177, -0.0919983),
        ("Cambridge", 52.2055314, 0.1186637),
        ("Oxford", 51.7520131, -1.2578499),
    ];

    FIXTURE
        .iter()
        .map(|&(name, lat, lon)| CityCandidate { name, lat, lon })
        .collect()
}
#[test]
fn type_rank_ordering() {
assert!(type_rank("city") < type_rank("town"));
@ -316,4 +431,41 @@ mod tests {
assert!(!is_travel_destination_type("town"));
assert!(!is_travel_destination_type("suburb"));
}
// In the fixture the closest candidate to this point is "Westminster", and the
// "London" candidate is well inside the London guard distance, so the label
// must be canonicalised to "London".
#[test]
fn nearest_display_city_canonicalizes_greater_london_aliases() {
    let candidates = test_city_candidates();
    let label = nearest_display_city(51.3713049, -0.101957, &candidates);
    assert_eq!(label, Some("London"));
}
// The closest fixture candidate to this point is "Cambridge", which is not a
// London alias, so its name must be returned unchanged.
#[test]
fn nearest_display_city_preserves_non_london_duplicates() {
    let candidates = test_city_candidates();
    let label = nearest_display_city(52.1277704, -0.0813098, &candidates);
    assert_eq!(label, Some("Cambridge"));
}
// Here the closest candidate is "Westminster" (a London alias), but the
// "London" candidate lies outside the London guard distance. The result must
// be `None` rather than the raw alias name.
#[test]
fn nearest_display_city_does_not_leak_westminster_label_past_london_guard() {
    let candidates = test_city_candidates();
    let label = nearest_display_city(51.5093, -0.5954, &candidates);
    assert_eq!(label, None);
}
// The closest fixture candidate to this point is "Oxford", within the parent
// city threshold, so the ordinary nearest-city label applies.
#[test]
fn nearest_display_city_keeps_normal_non_london_city() {
    let candidates = test_city_candidates();
    let label = nearest_display_city(51.456659, -0.969651, &candidates);
    assert_eq!(label, Some("Oxford"));
}
}