perfect-postcode/server-rs/src/data/poi.rs
2026-03-15 17:38:26 +00:00

178 lines
6.4 KiB
Rust

use std::collections::{HashMap, HashSet};
use std::path::Path;
use anyhow::{bail, Context};
use polars::frame::DataFrame;
use polars::lazy::frame::LazyFrame;
use polars::prelude::*;
use serde::Serialize;
use tracing::info;
use crate::features::POI_GROUP_ORDER;
use crate::utils::{generate_priorities, InternedColumn};
/// A named POI group together with the categories that belong to it.
///
/// Values of this type are built by `POIData::category_groups`. The type derives
/// `Serialize`, so it can be written out with serde, and `Clone`.
#[derive(Serialize, Clone)]
pub struct POICategoryGroup {
/// Name of the group (one entry of `POI_GROUP_ORDER` when built by
/// `POIData::category_groups`).
pub name: String,
/// Distinct category names seen for this group; `POIData::category_groups`
/// returns them sorted.
pub categories: Vec<String>,
}
/// Column-oriented, in-memory table of points of interest.
///
/// As populated by `POIData::load`, every column holds one entry per row of the
/// source parquet file, so the same `row` index addresses the same POI in each
/// column. IDs are not stored as individual `String`s; use `POIData::id` to read one.
pub struct POIData {
/// Contiguous buffer holding all POI ID strings end-to-end.
id_buffer: String,
/// Byte offset into `id_buffer` where each row's ID starts.
id_offsets: Vec<u32>,
/// Length in bytes of each row's ID.
/// Stored as `u8`, so a single ID occupies at most 255 bytes of `id_buffer`.
id_lengths: Vec<u8>,
/// Values of the `group` column, stored as an `InternedColumn`.
pub group: InternedColumn,
/// Values of the `category` column, stored as an `InternedColumn`.
pub category: InternedColumn,
/// Values of the `name` column; `load` stores null entries as empty strings.
pub name: Vec<String>,
/// Values of the `lat` column cast to `f32` (presumably latitude); `load`
/// stores null entries as `0.0`.
pub lat: Vec<f32>,
/// Values of the `lng` column cast to `f32` (presumably longitude); `load`
/// stores null entries as `0.0`.
pub lng: Vec<f32>,
/// Values of the `emoji` column, stored as an `InternedColumn`.
pub emoji: InternedColumn,
/// Deterministic pseudo-random priority per row, used to select a spatially
/// uniform subset when the POI count exceeds the per-request limit.
/// Computed once at load time so the same POIs are always chosen for a given viewport.
pub priority: Vec<u32>,
}
impl POIData {
    /// Returns the ID of the POI at `row`, borrowed from the packed ID buffer.
    ///
    /// The ID is the byte range of `id_buffer` that starts at `id_offsets[row]`
    /// and spans `id_lengths[row]` bytes.
    ///
    /// # Panics
    /// Panics if `row` is out of bounds for the offset/length tables, or if the
    /// recorded range is not a valid slice of the buffer.
    pub fn id(&self, row: usize) -> &str {
        let start = self.id_offsets[row] as usize;
        let end = start + self.id_lengths[row] as usize;
        &self.id_buffer[start..end]
    }
}
/// Copies the string column `name` of `df` into a vector of owned `String`s,
/// one per row.
///
/// Null entries are returned as empty strings.
///
/// # Errors
/// Fails if `df` has no column called `name`, or if that column is not a
/// string column.
fn extract_str_col(df: &DataFrame, name: &str) -> anyhow::Result<Vec<String>> {
    let strings = df
        .column(name)
        .with_context(|| format!("Missing column '{name}' in POI data"))?
        .str()
        .with_context(|| format!("Column '{name}' is not a string column"))?;

    let owned: Vec<String> = strings
        .into_iter()
        // A null cell yields `None`, whose default `&str` is the empty string.
        .map(|cell| cell.unwrap_or_default().to_owned())
        .collect();
    Ok(owned)
}
/// Reads the column `name` of `df` as `f32` values, one per row.
///
/// The column is cast to `Float32` first; any entry that is null after the
/// cast is replaced by `default`.
///
/// # Errors
/// Fails if `df` has no column called `name`, if the cast to `Float32` fails,
/// or if the cast result cannot be viewed as a float32 column.
fn extract_f32_col(df: &DataFrame, name: &str, default: f32) -> anyhow::Result<Vec<f32>> {
    let as_float32 = df
        .column(name)
        .with_context(|| format!("Missing column '{name}' in POI data"))?
        .cast(&DataType::Float32)
        .with_context(|| format!("Failed to cast column '{name}' to Float32"))?;
    // `as_float32` owns the cast data; the typed view below borrows from it.
    let floats = as_float32
        .f32()
        .with_context(|| format!("Column '{name}' is not a float32 column"))?;

    let values: Vec<f32> = floats
        .into_iter()
        .map(|cell| cell.unwrap_or(default))
        .collect();
    Ok(values)
}
impl POIData {
pub fn load(parquet_path: &Path) -> anyhow::Result<Self> {
info!("Loading POI data from {:?}...", parquet_path);
let df = LazyFrame::scan_parquet(parquet_path, Default::default())
.context("Failed to scan POI parquet")?
.collect()
.context("Failed to read POI parquet")?;
let row_count = df.height();
info!("Loaded {} POIs", row_count);
let id_raw: Vec<String> = extract_str_col(&df, "id")?;
let name = extract_str_col(&df, "name")?;
let category_raw = extract_str_col(&df, "category")?;
let group_raw = extract_str_col(&df, "group")?;
let lat = extract_f32_col(&df, "lat", 0.0)?;
let lng = extract_f32_col(&df, "lng", 0.0)?;
let emoji_raw = extract_str_col(&df, "emoji")?;
// Pack POI IDs into a contiguous buffer
let total_id_bytes: usize = id_raw.iter().map(|s| s.len()).sum();
let mut id_buffer = String::with_capacity(total_id_bytes);
let mut id_offsets = Vec::with_capacity(row_count);
let mut id_lengths = Vec::with_capacity(row_count);
for s in &id_raw {
let offset = id_buffer.len() as u32;
let length = s.len().min(u8::MAX as usize) as u8;
id_offsets.push(offset);
id_lengths.push(length);
id_buffer.push_str(&s[..length as usize]);
}
let category = InternedColumn::build(&category_raw);
let group = InternedColumn::build(&group_raw);
let emoji = InternedColumn::build(&emoji_raw);
info!(
category_unique = category.values.len(),
group_unique = group.values.len(),
emoji_unique = emoji.values.len(),
"POI string columns interned"
);
// Assign a deterministic pseudo-random priority to each row.
// This ensures the same POIs are selected across requests,
// preventing visual "shuffling" when panning the map.
let priority = generate_priorities(row_count);
info!("POI data loading complete.");
Ok(POIData {
id_buffer,
id_offsets,
id_lengths,
name,
category,
group,
lat,
lng,
emoji,
priority,
})
}
/// Builds the POI category groups, returned in the order of `POI_GROUP_ORDER`.
///
/// Every row contributes its category to the set of its group. The set of
/// groups found in the data must equal the set of names in `POI_GROUP_ORDER`.
/// Each returned group lists its distinct categories in sorted order.
///
/// # Errors
/// Fails if a group appears in `POI_GROUP_ORDER` but not in the data, or in
/// the data but not in `POI_GROUP_ORDER`.
pub fn category_groups(&self) -> anyhow::Result<Vec<POICategoryGroup>> {
    // Gather the distinct categories observed under each group.
    let mut by_group: HashMap<String, HashSet<String>> = HashMap::new();
    for row in 0..self.category.indices.len() {
        let category = self.category.get(row).to_string();
        by_group
            .entry(self.group.get(row).to_string())
            .or_default()
            .insert(category);
    }

    // Validate that data groups match the hardcoded order exactly.
    let expected: HashSet<&str> = POI_GROUP_ORDER.iter().copied().collect();
    let actual: HashSet<&str> = by_group.keys().map(String::as_str).collect();
    let missing_from_data: Vec<&&str> = expected.difference(&actual).collect();
    let missing_from_order: Vec<&&str> = actual.difference(&expected).collect();
    let groups_match = missing_from_data.is_empty() && missing_from_order.is_empty();
    if !groups_match {
        bail!(
            "POI group mismatch!\n In POI_GROUP_ORDER but not in data: {:?}\n In data but not in POI_GROUP_ORDER: {:?}",
            missing_from_data, missing_from_order
        );
    }

    // Emit the groups in the hardcoded order, moving each set out of the map.
    let mut ordered = Vec::new();
    for group_name in POI_GROUP_ORDER.iter() {
        let name = group_name.to_string();
        let members = by_group
            .remove(&name)
            .context("POI group validated but missing from map")?;
        let mut categories: Vec<String> = members.into_iter().collect();
        categories.sort();
        ordered.push(POICategoryGroup { name, categories });
    }
    Ok(ordered)
}
}