lmao
This commit is contained in:
parent
03445188ea
commit
524580eb25
102 changed files with 36625 additions and 1295 deletions
168
server-rs/src/data/places.rs
Normal file
168
server-rs/src/data/places.rs
Normal file
|
|
@ -0,0 +1,168 @@
|
|||
use std::path::Path;
|
||||
|
||||
use anyhow::Context;
|
||||
use polars::frame::DataFrame;
|
||||
use polars::lazy::frame::LazyFrame;
|
||||
use polars::prelude::*;
|
||||
use tracing::info;
|
||||
|
||||
use crate::utils::InternedColumn;
|
||||
|
||||
pub struct PlaceData {
|
||||
pub name: Vec<String>,
|
||||
pub name_lower: Vec<String>,
|
||||
pub place_type: InternedColumn,
|
||||
pub type_rank: Vec<u8>,
|
||||
pub population: Vec<u32>,
|
||||
pub lat: Vec<f32>,
|
||||
pub lon: Vec<f32>,
|
||||
pub city: Vec<Option<String>>,
|
||||
}
|
||||
|
||||
/// Maps a place-type label to its importance rank.
///
/// Lower numbers are more prominent (`"city"` is `0`). Matching is exact and
/// case-sensitive; any label that is not in the table ranks last, at `12`.
fn type_rank(place_type: &str) -> u8 {
    // Rank assigned to every label that is not listed below.
    const UNKNOWN_RANK: u8 = 12;
    const RANKS: [(&str, u8); 12] = [
        ("city", 0),
        ("borough", 1),
        ("town", 2),
        ("suburb", 3),
        ("quarter", 4),
        ("neighbourhood", 5),
        ("village", 6),
        ("station", 7),
        ("island", 8),
        ("hamlet", 9),
        ("locality", 10),
        ("isolated_dwelling", 11),
    ];

    RANKS
        .iter()
        .find(|entry| entry.0 == place_type)
        .map_or(UNKNOWN_RANK, |entry| entry.1)
}
|
||||
|
||||
/// Copies the string column `name` out of `df` into owned `String`s.
///
/// Null cells are returned as empty strings.
///
/// # Errors
///
/// Fails when the column does not exist or is not a string column.
fn extract_str_col(df: &DataFrame, name: &str) -> anyhow::Result<Vec<String>> {
    let strings = df
        .column(name)
        .with_context(|| format!("Missing column '{name}' in places data"))?
        .str()
        .with_context(|| format!("Column '{name}' is not a string column"))?;

    let owned = strings
        .into_iter()
        .map(|cell| cell.map_or_else(String::new, str::to_owned))
        .collect();
    Ok(owned)
}
|
||||
|
||||
fn extract_f32_col(df: &DataFrame, name: &str) -> anyhow::Result<Vec<f32>> {
|
||||
let column = df
|
||||
.column(name)
|
||||
.with_context(|| format!("Missing column '{name}' in places data"))?;
|
||||
let cast = column
|
||||
.cast(&DataType::Float32)
|
||||
.with_context(|| format!("Failed to cast column '{name}' to Float32"))?;
|
||||
let float_column = cast
|
||||
.f32()
|
||||
.with_context(|| format!("Column '{name}' is not a float32 column"))?;
|
||||
Ok(float_column
|
||||
.into_iter()
|
||||
.map(|value| value.unwrap_or(0.0))
|
||||
.collect())
|
||||
}
|
||||
|
||||
impl PlaceData {
|
||||
pub fn load(parquet_path: &Path) -> anyhow::Result<Self> {
|
||||
info!("Loading place data from {:?}...", parquet_path);
|
||||
|
||||
let df = LazyFrame::scan_parquet(parquet_path, Default::default())
|
||||
.context("Failed to scan places parquet")?
|
||||
.collect()
|
||||
.context("Failed to read places parquet")?;
|
||||
|
||||
let row_count = df.height();
|
||||
info!("Loaded {} places", row_count);
|
||||
|
||||
let name = extract_str_col(&df, "name")?;
|
||||
let place_type_raw = extract_str_col(&df, "place_type")?;
|
||||
let lat = extract_f32_col(&df, "lat")?;
|
||||
let lon = extract_f32_col(&df, "lon")?;
|
||||
let population: Vec<u32> = if df.column("population").is_ok() {
|
||||
let pop_f32 = extract_f32_col(&df, "population")?;
|
||||
pop_f32.iter().map(|&val| val.max(0.0) as u32).collect()
|
||||
} else {
|
||||
vec![0; row_count]
|
||||
};
|
||||
|
||||
let name_lower: Vec<String> = name.iter().map(|nm| nm.to_lowercase()).collect();
|
||||
let type_rank_vec: Vec<u8> = place_type_raw.iter().map(|pt| type_rank(pt)).collect();
|
||||
let place_type = InternedColumn::build(&place_type_raw);
|
||||
|
||||
// Precompute nearest city for each non-city place
|
||||
let city_indices: Vec<usize> = type_rank_vec
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter_map(|(idx, &rank)| if rank == 0 { Some(idx) } else { None })
|
||||
.collect();
|
||||
|
||||
let city: Vec<Option<String>> = (0..row_count)
|
||||
.map(|idx| {
|
||||
if type_rank_vec[idx] == 0 {
|
||||
return None; // Cities don't need a city label
|
||||
}
|
||||
let plat = lat[idx];
|
||||
let plon = lon[idx];
|
||||
let cos_lat = (plat.to_radians()).cos();
|
||||
|
||||
let mut best_dist_sq = f32::MAX;
|
||||
let mut best_city: Option<&str> = None;
|
||||
for &ci in &city_indices {
|
||||
let dlat = lat[ci] - plat;
|
||||
let dlon = (lon[ci] - plon) * cos_lat;
|
||||
let dist_sq = dlat * dlat + dlon * dlon;
|
||||
if dist_sq < best_dist_sq {
|
||||
best_dist_sq = dist_sq;
|
||||
best_city = Some(&name[ci]);
|
||||
}
|
||||
}
|
||||
|
||||
// ~100km threshold: 1° ≈ 111km, so 0.9° ≈ 100km → 0.81 squared
|
||||
if best_dist_sq < 0.81 {
|
||||
best_city.map(|s| s.to_string())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
let with_pop = population.iter().filter(|&&pop| pop > 0).count();
|
||||
let with_city = city.iter().filter(|c| c.is_some()).count();
|
||||
info!(
|
||||
places = row_count,
|
||||
types = place_type.values.len(),
|
||||
with_population = with_pop,
|
||||
with_city = with_city,
|
||||
"Place data loaded"
|
||||
);
|
||||
|
||||
Ok(PlaceData {
|
||||
name,
|
||||
name_lower,
|
||||
place_type,
|
||||
type_rank: type_rank_vec,
|
||||
population,
|
||||
lat,
|
||||
lon,
|
||||
city,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Each place type in the list must rank strictly before the one after it.
    #[test]
    fn type_rank_ordering() {
        let most_to_least_prominent = [
            "city",
            "town",
            "suburb",
            "village",
            "hamlet",
            "isolated_dwelling",
        ];

        for pair in most_to_least_prominent.windows(2) {
            assert!(type_rank(pair[0]) < type_rank(pair[1]));
        }
    }
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue