344 lines
11 KiB
Rust
344 lines
11 KiB
Rust
use std::collections::{HashMap, HashSet};
|
|
use std::path::Path;
|
|
|
|
use anyhow::{bail, Context};
|
|
use polars::frame::DataFrame;
|
|
use polars::lazy::frame::LazyFrame;
|
|
use polars::prelude::*;
|
|
use rustc_hash::FxHashSet;
|
|
use serde::Serialize;
|
|
use tracing::info;
|
|
|
|
use crate::features::POI_GROUP_ORDER;
|
|
use crate::utils::{generate_priorities, InternedColumn};
|
|
|
|
#[derive(Serialize, Clone)]
|
|
pub struct POICategoryGroup {
|
|
pub name: String,
|
|
pub categories: Vec<String>,
|
|
}
|
|
|
|
/// Categories listed under the "Groceries" dashboard group, in preferred display order.
///
/// The position of an entry in this slice is significant: it is used as the
/// sort key when categories of the group are ordered for display.
const GROCERY_DASHBOARD_CATEGORIES: &[&str] = &[
    // Generic shop types.
    "Supermarket",
    "Convenience Store",
    "Bakery",
    "Greengrocer",
    // Named retailers.
    "Aldi",
    "Amazon",
    "Asda",
    "Booths",
    "Budgens",
    "Centra",
    "Co-op",
    "COOK",
    "Costco",
    "Dunnes Stores",
    "Farmfoods",
    "Heron Foods",
    "Iceland",
    "Lidl",
    "Makro",
    "M&S",
    "Morrisons",
    "Planet Organic",
    "Sainsbury's",
    "Spar",
    "Tesco",
    "The Food Warehouse",
    "Waitrose",
    "Whole Foods Market",
];
|
|
|
|
const DASHBOARD_POI_GROUPS: &[(&str, &[&str])] = &[
|
|
(
|
|
"Public Transport",
|
|
&[
|
|
"Rail station",
|
|
"Tube station",
|
|
"Bus station",
|
|
"Bus stop",
|
|
"Airport",
|
|
],
|
|
),
|
|
("Groceries", GROCERY_DASHBOARD_CATEGORIES),
|
|
("Food & Drink", &["Café", "Restaurant", "Pub", "Fast Food"]),
|
|
("Green Space", &["Park", "Playground"]),
|
|
("Education", &["School"]),
|
|
(
|
|
"Health",
|
|
&["GP Surgery", "Pharmacy", "Dentist", "Hospital & Clinic"],
|
|
),
|
|
(
|
|
"Leisure",
|
|
&[
|
|
"Gym & Fitness",
|
|
"Sports Centre",
|
|
"Cinema",
|
|
"Theatre",
|
|
"Library",
|
|
],
|
|
),
|
|
(
|
|
"Practical",
|
|
&["Post Office", "Bank", "EV Charging", "Fuel Station"],
|
|
),
|
|
];
|
|
|
|
fn add_category_filter_index(
|
|
category_values: &[String],
|
|
category: &str,
|
|
selected: &mut FxHashSet<u16>,
|
|
) {
|
|
if let Some(pos) = category_values.iter().position(|value| value == category) {
|
|
selected.insert(pos as u16);
|
|
}
|
|
}
|
|
|
|
pub fn resolve_poi_category_filter(category_values: &[String], categories: &str) -> FxHashSet<u16> {
|
|
let mut selected = FxHashSet::default();
|
|
for part in categories.split(',') {
|
|
let category = part.trim();
|
|
if category.is_empty() {
|
|
continue;
|
|
}
|
|
add_category_filter_index(category_values, category, &mut selected);
|
|
}
|
|
selected
|
|
}
|
|
|
|
pub struct POIData {
|
|
/// Contiguous buffer holding all POI ID strings end-to-end.
|
|
id_buffer: String,
|
|
/// Byte offset into `id_buffer` where each row's ID starts.
|
|
id_offsets: Vec<u32>,
|
|
/// Length in bytes of each row's ID.
|
|
id_lengths: Vec<u16>,
|
|
pub group: InternedColumn,
|
|
pub category: InternedColumn,
|
|
pub icon_category: InternedColumn,
|
|
pub name: Vec<String>,
|
|
pub lat: Vec<f32>,
|
|
pub lng: Vec<f32>,
|
|
pub emoji: InternedColumn,
|
|
/// Deterministic pseudo-random priority per row, used to select a spatially
|
|
/// uniform subset when the POI count exceeds the per-request limit.
|
|
/// Computed once at load time so the same POIs are always chosen for a given viewport.
|
|
pub priority: Vec<u32>,
|
|
}
|
|
|
|
impl POIData {
|
|
/// Get the ID string for a given row.
|
|
pub fn id(&self, row: usize) -> &str {
|
|
let offset = self.id_offsets[row] as usize;
|
|
let length = self.id_lengths[row] as usize;
|
|
&self.id_buffer[offset..offset + length]
|
|
}
|
|
}
|
|
|
|
/// Copies the string column `name` of `df` into a vector of owned strings, one per row.
///
/// # Errors
///
/// Fails when the column does not exist, is not a string column, or contains a
/// null; the error names the column (and the row for nulls).
fn extract_str_col(df: &DataFrame, name: &str) -> anyhow::Result<Vec<String>> {
    let strings = df
        .column(name)
        .with_context(|| format!("Missing column '{name}' in POI data"))?
        .str()
        .with_context(|| format!("Column '{name}' is not a string column"))?;

    let mut values = Vec::with_capacity(df.height());
    for (row, cell) in strings.into_iter().enumerate() {
        // The first null aborts the extraction, exactly like a fallible collect.
        let text = cell.with_context(|| format!("Column '{name}' has null at row {row}"))?;
        values.push(text.to_string());
    }
    Ok(values)
}
|
|
|
|
fn extract_f32_col(df: &DataFrame, name: &str) -> anyhow::Result<Vec<f32>> {
|
|
let column = df
|
|
.column(name)
|
|
.with_context(|| format!("Missing column '{name}' in POI data"))?;
|
|
let cast = column
|
|
.cast(&DataType::Float32)
|
|
.with_context(|| format!("Failed to cast column '{name}' to Float32"))?;
|
|
let float_column = cast
|
|
.f32()
|
|
.with_context(|| format!("Column '{name}' is not a float32 column"))?;
|
|
float_column
|
|
.into_iter()
|
|
.enumerate()
|
|
.map(|(row, value)| value.with_context(|| format!("Column '{name}' has null at row {row}")))
|
|
.collect()
|
|
}
|
|
|
|
impl POIData {
|
|
pub fn load(parquet_path: &Path) -> anyhow::Result<Self> {
|
|
super::run_polars_io(|| Self::load_inner(parquet_path))
|
|
}
|
|
|
|
fn load_inner(parquet_path: &Path) -> anyhow::Result<Self> {
|
|
info!("Loading POI data from {:?}...", parquet_path);
|
|
|
|
let parquet_path = PlRefPath::try_from_path(parquet_path)
|
|
.context("Failed to normalize POI parquet path")?;
|
|
let df = LazyFrame::scan_parquet(parquet_path, Default::default())
|
|
.context("Failed to scan POI parquet")?
|
|
.collect()
|
|
.context("Failed to read POI parquet")?;
|
|
|
|
let row_count = df.height();
|
|
info!("Loaded {} POIs", row_count);
|
|
|
|
let id_raw: Vec<String> = extract_str_col(&df, "id")?;
|
|
let name = extract_str_col(&df, "name")?;
|
|
let category_raw = extract_str_col(&df, "category")?;
|
|
let group_raw = extract_str_col(&df, "group")?;
|
|
let lat = extract_f32_col(&df, "lat")?;
|
|
let lng = extract_f32_col(&df, "lng")?;
|
|
let emoji_raw = extract_str_col(&df, "emoji")?;
|
|
let icon_category_raw = extract_str_col(&df, "icon_category")?;
|
|
|
|
// Pack POI IDs into a contiguous buffer
|
|
let total_id_bytes: usize = id_raw.iter().map(|s| s.len()).sum();
|
|
let mut id_buffer = String::with_capacity(total_id_bytes);
|
|
let mut id_offsets = Vec::with_capacity(row_count);
|
|
let mut id_lengths = Vec::with_capacity(row_count);
|
|
for s in &id_raw {
|
|
let offset = id_buffer.len() as u32;
|
|
let length = s.len().min(u16::MAX as usize) as u16;
|
|
id_offsets.push(offset);
|
|
id_lengths.push(length);
|
|
id_buffer.push_str(&s[..length as usize]);
|
|
}
|
|
|
|
let category = InternedColumn::build(&category_raw);
|
|
let icon_category = InternedColumn::build(&icon_category_raw);
|
|
let group = InternedColumn::build(&group_raw);
|
|
let emoji = InternedColumn::build(&emoji_raw);
|
|
|
|
info!(
|
|
category_unique = category.values.len(),
|
|
icon_category_unique = icon_category.values.len(),
|
|
group_unique = group.values.len(),
|
|
emoji_unique = emoji.values.len(),
|
|
"POI string columns interned"
|
|
);
|
|
|
|
// Assign a deterministic pseudo-random priority to each row.
|
|
// This ensures the same POIs are selected across requests,
|
|
// preventing visual "shuffling" when panning the map.
|
|
let priority = generate_priorities(row_count);
|
|
|
|
info!("POI data loading complete.");
|
|
|
|
Ok(POIData {
|
|
id_buffer,
|
|
id_offsets,
|
|
id_lengths,
|
|
name,
|
|
category,
|
|
icon_category,
|
|
group,
|
|
lat,
|
|
lng,
|
|
emoji,
|
|
priority,
|
|
})
|
|
}
|
|
|
|
/// Build dashboard category groups from every category present in the loaded POI data.
|
|
pub fn category_groups(&self) -> anyhow::Result<Vec<POICategoryGroup>> {
|
|
let mut group_cats: HashMap<String, HashSet<String>> = HashMap::new();
|
|
let num_pois = self.category.indices.len();
|
|
for row in 0..num_pois {
|
|
let category = self.category.get(row).to_string();
|
|
let group = self.group.get(row).to_string();
|
|
group_cats.entry(group).or_default().insert(category);
|
|
}
|
|
|
|
// Validate that data groups match the hardcoded order exactly
|
|
let expected: HashSet<&str> = POI_GROUP_ORDER.iter().copied().collect();
|
|
let actual: HashSet<&str> = group_cats.keys().map(|key| key.as_str()).collect();
|
|
let missing_from_data: Vec<&&str> = expected.difference(&actual).collect();
|
|
let missing_from_order: Vec<&&str> = actual.difference(&expected).collect();
|
|
if !missing_from_data.is_empty() || !missing_from_order.is_empty() {
|
|
bail!(
|
|
"POI group mismatch!\n In POI_GROUP_ORDER but not in data: {:?}\n In data but not in POI_GROUP_ORDER: {:?}",
|
|
missing_from_data, missing_from_order
|
|
);
|
|
}
|
|
|
|
let preferred_order: HashMap<&str, HashMap<&str, usize>> = DASHBOARD_POI_GROUPS
|
|
.iter()
|
|
.map(|(group, categories)| {
|
|
(
|
|
*group,
|
|
categories
|
|
.iter()
|
|
.enumerate()
|
|
.map(|(idx, category)| (*category, idx))
|
|
.collect(),
|
|
)
|
|
})
|
|
.collect();
|
|
|
|
let groups: Vec<POICategoryGroup> = POI_GROUP_ORDER
|
|
.iter()
|
|
.filter_map(|group_name| {
|
|
let mut categories: Vec<String> = group_cats
|
|
.get(*group_name)
|
|
.map(|categories| categories.iter().cloned().collect())
|
|
.unwrap_or_default();
|
|
if categories.is_empty() {
|
|
return None;
|
|
}
|
|
let group_order = preferred_order.get(*group_name);
|
|
categories.sort_by(|a, b| {
|
|
let a_order = group_order.and_then(|order| order.get(a.as_str())).copied();
|
|
let b_order = group_order.and_then(|order| order.get(b.as_str())).copied();
|
|
match (a_order, b_order) {
|
|
(Some(left), Some(right)) => left.cmp(&right),
|
|
(Some(_), None) => std::cmp::Ordering::Less,
|
|
(None, Some(_)) => std::cmp::Ordering::Greater,
|
|
(None, None) => a.cmp(b),
|
|
}
|
|
});
|
|
Some(POICategoryGroup {
|
|
name: (*group_name).to_string(),
|
|
categories,
|
|
})
|
|
})
|
|
.collect();
|
|
|
|
Ok(groups)
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds an owned category list from string literals.
    fn owned(names: &[&str]) -> Vec<String> {
        names.iter().map(|name| (*name).to_string()).collect()
    }

    /// Names present in the value list resolve to exactly their positions.
    #[test]
    fn category_filter_matches_exact_present_categories() {
        let values = owned(&["Supermarket", "Tesco", "Aldi", "Rail station"]);

        let selected = resolve_poi_category_filter(&values, "Supermarket,Rail station");

        let mut indices: Vec<u16> = selected.into_iter().collect();
        indices.sort_unstable();
        assert_eq!(indices, vec![0, 3]);
    }

    /// A name that is not in the value list selects nothing.
    #[test]
    fn unknown_category_filter_matches_nothing() {
        let values = owned(&["Supermarket"]);

        let selected = resolve_poi_category_filter(&values, "Unknown");

        assert_eq!(selected.len(), 0);
    }
}
|