This commit is contained in:
Andras Schmelczer 2026-02-15 22:39:49 +00:00
parent 03445188ea
commit 524580eb25
102 changed files with 36625 additions and 1295 deletions

View file

@ -0,0 +1,168 @@
use std::path::Path;
use anyhow::Context;
use polars::frame::DataFrame;
use polars::lazy::frame::LazyFrame;
use polars::prelude::*;
use tracing::info;
use crate::utils::InternedColumn;
pub struct PlaceData {
pub name: Vec<String>,
pub name_lower: Vec<String>,
pub place_type: InternedColumn,
pub type_rank: Vec<u8>,
pub population: Vec<u32>,
pub lat: Vec<f32>,
pub lon: Vec<f32>,
pub city: Vec<Option<String>>,
}
fn type_rank(place_type: &str) -> u8 {
match place_type {
"city" => 0,
"borough" => 1,
"town" => 2,
"suburb" => 3,
"quarter" => 4,
"neighbourhood" => 5,
"village" => 6,
"station" => 7,
"island" => 8,
"hamlet" => 9,
"locality" => 10,
"isolated_dwelling" => 11,
_ => 12,
}
}
fn extract_str_col(df: &DataFrame, name: &str) -> anyhow::Result<Vec<String>> {
let column = df
.column(name)
.with_context(|| format!("Missing column '{name}' in places data"))?;
let string_column = column
.str()
.with_context(|| format!("Column '{name}' is not a string column"))?;
Ok(string_column
.into_iter()
.map(|value| value.unwrap_or("").to_string())
.collect())
}
fn extract_f32_col(df: &DataFrame, name: &str) -> anyhow::Result<Vec<f32>> {
let column = df
.column(name)
.with_context(|| format!("Missing column '{name}' in places data"))?;
let cast = column
.cast(&DataType::Float32)
.with_context(|| format!("Failed to cast column '{name}' to Float32"))?;
let float_column = cast
.f32()
.with_context(|| format!("Column '{name}' is not a float32 column"))?;
Ok(float_column
.into_iter()
.map(|value| value.unwrap_or(0.0))
.collect())
}
impl PlaceData {
pub fn load(parquet_path: &Path) -> anyhow::Result<Self> {
info!("Loading place data from {:?}...", parquet_path);
let df = LazyFrame::scan_parquet(parquet_path, Default::default())
.context("Failed to scan places parquet")?
.collect()
.context("Failed to read places parquet")?;
let row_count = df.height();
info!("Loaded {} places", row_count);
let name = extract_str_col(&df, "name")?;
let place_type_raw = extract_str_col(&df, "place_type")?;
let lat = extract_f32_col(&df, "lat")?;
let lon = extract_f32_col(&df, "lon")?;
let population: Vec<u32> = if df.column("population").is_ok() {
let pop_f32 = extract_f32_col(&df, "population")?;
pop_f32.iter().map(|&val| val.max(0.0) as u32).collect()
} else {
vec![0; row_count]
};
let name_lower: Vec<String> = name.iter().map(|nm| nm.to_lowercase()).collect();
let type_rank_vec: Vec<u8> = place_type_raw.iter().map(|pt| type_rank(pt)).collect();
let place_type = InternedColumn::build(&place_type_raw);
// Precompute nearest city for each non-city place
let city_indices: Vec<usize> = type_rank_vec
.iter()
.enumerate()
.filter_map(|(idx, &rank)| if rank == 0 { Some(idx) } else { None })
.collect();
let city: Vec<Option<String>> = (0..row_count)
.map(|idx| {
if type_rank_vec[idx] == 0 {
return None; // Cities don't need a city label
}
let plat = lat[idx];
let plon = lon[idx];
let cos_lat = (plat.to_radians()).cos();
let mut best_dist_sq = f32::MAX;
let mut best_city: Option<&str> = None;
for &ci in &city_indices {
let dlat = lat[ci] - plat;
let dlon = (lon[ci] - plon) * cos_lat;
let dist_sq = dlat * dlat + dlon * dlon;
if dist_sq < best_dist_sq {
best_dist_sq = dist_sq;
best_city = Some(&name[ci]);
}
}
// ~100km threshold: 1° ≈ 111km, so 0.9° ≈ 100km → 0.81 squared
if best_dist_sq < 0.81 {
best_city.map(|s| s.to_string())
} else {
None
}
})
.collect();
let with_pop = population.iter().filter(|&&pop| pop > 0).count();
let with_city = city.iter().filter(|c| c.is_some()).count();
info!(
places = row_count,
types = place_type.values.len(),
with_population = with_pop,
with_city = with_city,
"Place data loaded"
);
Ok(PlaceData {
name,
name_lower,
place_type,
type_rank: type_rank_vec,
population,
lat,
lon,
city,
})
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn type_rank_ordering() {
assert!(type_rank("city") < type_rank("town"));
assert!(type_rank("town") < type_rank("suburb"));
assert!(type_rank("suburb") < type_rank("village"));
assert!(type_rank("village") < type_rank("hamlet"));
assert!(type_rank("hamlet") < type_rank("isolated_dwelling"));
}
}