perfect-postcode/server-rs/src/data/travel_time.rs
2026-03-15 17:38:26 +00:00

282 lines
9.6 KiB
Rust

use std::collections::VecDeque;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use anyhow::Context;
use parking_lot::Mutex;
use polars::lazy::frame::LazyFrame;
use rustc_hash::{FxHashMap, FxHashSet};
use tracing::info;
/// Travel time data for one postcode towards one destination.
///
/// `minutes` is the median travel time. The other two fields are only present
/// when the source file carries the corresponding optional columns (transit
/// data; journeys additionally require the files to be generated with `--paths`).
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct TravelDataRow {
    /// Median travel time in minutes.
    pub minutes: i16,
    /// Best-case travel time in minutes, when available.
    pub best_minutes: Option<i16>,
    /// Journey instructions as a JSON leg array, when available.
    /// Stored as `Arc<str>` so cloning a row does not copy the text.
    pub journey: Option<Arc<str>>,
}
/// Shared, read-only lookup for a single destination file: postcode (the value
/// of the file's `pcds` column) → that postcode's [`TravelDataRow`].
///
/// Wrapped in `Arc` so a cached entry can be handed to callers without copying the map.
pub type TravelData = Arc<FxHashMap<String, TravelDataRow>>;
/// Simple LRU cache for travel time data, limited to `capacity` entries.
///
/// Recency is tracked in a separate queue rather than inside the map, so
/// refreshing an entry scans `order` linearly.
struct LruCache {
/// Cached data keyed by `(mode, slug)`.
map: FxHashMap<(String, String), TravelData>,
/// Keys ordered from most recently used (front) to least recently used (back).
order: VecDeque<(String, String)>,
/// Maximum number of entries kept in `map`.
capacity: usize,
}
impl LruCache {
    /// Creates an empty cache that retains at most `capacity` entries.
    ///
    /// A `capacity` of zero produces a cache that never stores anything.
    fn new(capacity: usize) -> Self {
        Self {
            map: FxHashMap::default(),
            order: VecDeque::with_capacity(capacity),
            capacity,
        }
    }

    /// Returns the cached data for `key`, marking it as most recently used.
    ///
    /// Returns `None` when `key` is not cached.
    fn get(&mut self, key: &(String, String)) -> Option<TravelData> {
        let data = self.map.get(key)?.clone();
        // Move the existing key to the front, reusing its allocation instead of cloning.
        if let Some(pos) = self.order.iter().position(|k| k == key) {
            if let Some(existing) = self.order.remove(pos) {
                self.order.push_front(existing);
            }
        }
        Some(data)
    }

    /// Stores `data` under `key` as the most recently used entry.
    ///
    /// Replaces the value if `key` is already cached; otherwise evicts
    /// least-recently-used entries until there is room for the new one.
    fn insert(&mut self, key: (String, String), data: TravelData) {
        // A zero-capacity cache can hold nothing. Without this guard the
        // eviction loop below could never make room and would spin forever.
        if self.capacity == 0 {
            return;
        }

        if self.map.contains_key(&key) {
            // Existing entry: drop its old recency slot; it is re-added at the front below.
            if let Some(pos) = self.order.iter().position(|k| k == &key) {
                self.order.remove(pos);
            }
        } else {
            while self.map.len() >= self.capacity {
                match self.order.pop_back() {
                    Some(oldest) => {
                        self.map.remove(&oldest);
                    }
                    // Nothing left to evict; stop rather than loop forever.
                    None => break,
                }
            }
        }

        self.map.insert(key.clone(), data);
        self.order.push_front(key);
    }
}
/// Remove a leading all-digit prefix and its hyphen from a filename stem.
///
/// Only the text before the first `-` is considered. If every character of
/// it is an ASCII digit (an empty prefix also counts), the remainder after
/// that hyphen is returned; otherwise the stem is returned unchanged.
///
/// "000000-bank-tube-station" → "bank-tube-station"
fn strip_numeric_prefix(stem: &str) -> &str {
    match stem.split_once('-') {
        Some((prefix, rest)) if prefix.bytes().all(|b| b.is_ascii_digit()) => rest,
        _ => stem,
    }
}
/// Manages on-demand loading and caching of precomputed travel time parquet files.
///
/// Directory structure: `{base_dir}/{mode}/{NNNNNN-slug}.parquet`
/// Files have a numeric prefix for uniqueness; lookups use the stripped slug.
/// Each parquet file has the required columns `pcds` (String) and
/// `travel_minutes` (Int16), and may also carry the optional columns
/// `best_minutes` (Int16) and `journey` (String).
pub struct TravelTimeStore {
/// Root directory containing one subdirectory per transport mode.
base_dir: PathBuf,
/// Available transport modes (subdirectory names, e.g., "bicycle"), sorted.
pub available_modes: Vec<String>,
/// mode → set of destination slugs (numeric prefix stripped)
pub destinations: FxHashMap<String, FxHashSet<String>>,
/// (mode, stripped_slug) → full filename stem (with numeric prefix)
slug_to_file: FxHashMap<(String, String), String>,
/// Recently loaded files keyed by `(mode, slug)`; the lock is taken only for
/// the cache lookup and insert, not while a file is being read.
cache: Mutex<LruCache>,
}
impl TravelTimeStore {
    /// Scan the travel-times directory to discover available modes and destinations.
    ///
    /// Every subdirectory of `base_dir` is treated as a transport mode and every
    /// `*.parquet` file inside it as a destination. Filename stems have a numeric
    /// prefix (e.g., "000000-bank-tube-station") which is stripped for slug lookups
    /// but preserved for file loading. Modes without any parquet file are skipped.
    /// If two files in one mode reduce to the same slug, the one scanned last wins.
    ///
    /// `cache_capacity` is the maximum number of destination files kept in memory.
    ///
    /// # Errors
    ///
    /// Returns an error if `base_dir` or any mode directory cannot be read.
    pub fn load(base_dir: &Path, cache_capacity: usize) -> anyhow::Result<Self> {
        let mut available_modes = Vec::new();
        let mut destinations: FxHashMap<String, FxHashSet<String>> = FxHashMap::default();
        let mut slug_to_file: FxHashMap<(String, String), String> = FxHashMap::default();

        for entry in std::fs::read_dir(base_dir)
            .with_context(|| format!("Failed to read travel-times dir: {}", base_dir.display()))?
        {
            let entry = entry?;
            let path = entry.path();
            if !path.is_dir() {
                continue;
            }
            let mode = entry.file_name().to_string_lossy().to_string();
            let mut slugs = FxHashSet::default();

            for file_entry in std::fs::read_dir(&path)
                .with_context(|| format!("Failed to read mode dir: {}", path.display()))?
            {
                let file_entry = file_entry?;
                let file_name = file_entry.file_name();
                let file_name = file_name.to_string_lossy();
                // `strip_suffix` removes the extension exactly once, so the stored
                // stem always rebuilds to the real filename (unlike
                // `trim_end_matches`, which would also eat a repeated ".parquet").
                if let Some(file_stem) = file_name.strip_suffix(".parquet") {
                    let slug = strip_numeric_prefix(file_stem).to_string();
                    slug_to_file.insert((mode.clone(), slug.clone()), file_stem.to_string());
                    slugs.insert(slug);
                }
            }

            if !slugs.is_empty() {
                info!(
                    mode = mode.as_str(),
                    destinations = slugs.len(),
                    "Travel time mode discovered"
                );
                available_modes.push(mode.clone());
                destinations.insert(mode, slugs);
            }
        }
        available_modes.sort();

        Ok(Self {
            base_dir: base_dir.to_path_buf(),
            available_modes,
            destinations,
            slug_to_file,
            cache: Mutex::new(LruCache::new(cache_capacity)),
        })
    }

    /// Load travel time data for a given mode and destination slug.
    ///
    /// Returns a cached or freshly-loaded postcode → [`TravelDataRow`] mapping.
    /// Rows whose postcode or travel time is null are skipped. The cache lock is
    /// not held while the file is read, so concurrent misses for the same key may
    /// each load the file.
    ///
    /// # Errors
    ///
    /// Returns an error if the `mode`/`slug` pair was not discovered by
    /// [`TravelTimeStore::load`], if the parquet file cannot be read, or if a
    /// column is missing (required columns only) or has an unexpected type.
    pub fn get(&self, mode: &str, slug: &str) -> anyhow::Result<TravelData> {
        let key = (mode.to_string(), slug.to_string());

        // Check cache first
        {
            let mut cache = self.cache.lock();
            if let Some(data) = cache.get(&key) {
                return Ok(data);
            }
        }

        // Resolve slug to actual filename (may have numeric prefix).
        // Reject unknown slugs rather than falling back to raw input to prevent path traversal.
        let file_stem = self
            .slug_to_file
            .get(&key)
            .ok_or_else(|| anyhow::anyhow!("Unknown travel destination: {mode}/{slug}"))?;
        let path = self
            .base_dir
            .join(mode)
            .join(format!("{}.parquet", file_stem));

        let df = LazyFrame::scan_parquet(&path, Default::default())
            .with_context(|| format!("Failed to scan: {}", path.display()))?
            .collect()
            .with_context(|| format!("Failed to read: {}", path.display()))?;

        let postcodes = df
            .column("pcds")
            .context("Missing 'pcds' column")?
            .str()
            .context("'pcds' is not string")?;
        let minutes = df
            .column("travel_minutes")
            .context("Missing 'travel_minutes' column")?
            .i16()
            .context("'travel_minutes' is not i16")?;

        // Optional columns: absence is fine, but a present column of the wrong
        // type is reported as an error instead of panicking the caller.
        let best = df
            .column("best_minutes")
            .ok()
            .map(|col| col.i16().context("'best_minutes' is not i16"))
            .transpose()?;
        let journeys = df
            .column("journey")
            .ok()
            .map(|col| col.str().context("'journey' is not string"))
            .transpose()?;

        let mut map = FxHashMap::default();
        map.reserve(df.height());
        for (i, (pc, min)) in postcodes.into_iter().zip(minutes.into_iter()).enumerate() {
            if let (Some(pc), Some(min)) = (pc, min) {
                let best_min = best.as_ref().and_then(|b| b.get(i));
                let journey = journeys.as_ref().and_then(|j| j.get(i)).map(Arc::from);
                map.insert(
                    pc.to_string(),
                    TravelDataRow {
                        minutes: min,
                        best_minutes: best_min,
                        journey,
                    },
                );
            }
        }
        let data: TravelData = Arc::new(map);

        // Insert into cache
        {
            let mut cache = self.cache.lock();
            cache.insert(key, data.clone());
        }

        Ok(data)
    }

    /// Check if a mode + slug combination is available.
    pub fn has_destination(&self, mode: &str, slug: &str) -> bool {
        self.destinations
            .get(mode)
            .map_or(false, |slugs| slugs.contains(slug))
    }
}
/// Turn a place name into the slug used by travel time filenames.
///
/// Runs of ASCII letters and digits are lowercased and joined with single
/// hyphens; every other character acts as a separator, and the result never
/// starts or ends with a hyphen.
///
/// "Abbey Hey" → "abbey-hey", "A'Bhuaile Ghlas" → "a-bhuaile-ghlas"
pub fn slugify(name: &str) -> String {
    let mut slug = String::with_capacity(name.len());
    let words = name
        .split(|c: char| !c.is_ascii_alphanumeric())
        .filter(|word| !word.is_empty());
    for word in words {
        // A hyphen goes between words only, never before the first one.
        if !slug.is_empty() {
            slug.push('-');
        }
        slug.extend(word.chars().map(|c| c.to_ascii_lowercase()));
    }
    slug
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn slugify_basic() {
        assert_eq!(slugify("Abbey Hey"), "abbey-hey");
        assert_eq!(slugify("London"), "london");
    }

    /// Separators collapse to one hyphen and never lead or trail the slug.
    #[test]
    fn slugify_collapses_and_trims_separators() {
        assert_eq!(slugify("A'Bhuaile Ghlas"), "a-bhuaile-ghlas");
        assert_eq!(slugify("  St.  John's  "), "st-john-s");
        assert_eq!(slugify(""), "");
        assert_eq!(slugify("---"), "");
    }

    #[test]
    fn strip_numeric_prefix_basic() {
        assert_eq!(
            strip_numeric_prefix("000000-bank-tube-station"),
            "bank-tube-station"
        );
        assert_eq!(strip_numeric_prefix("000123-abbey-hey"), "abbey-hey");
        assert_eq!(
            strip_numeric_prefix("bank-tube-station"),
            "bank-tube-station"
        );
        assert_eq!(strip_numeric_prefix("london"), "london");
    }

    /// A prefix is only stripped when every character before the first hyphen is a digit.
    #[test]
    fn strip_numeric_prefix_requires_all_digits() {
        assert_eq!(strip_numeric_prefix("12a-abbey-hey"), "12a-abbey-hey");
        assert_eq!(strip_numeric_prefix("000001-"), "");
    }

    /// Reading an entry refreshes it, so the untouched entry is evicted first.
    #[test]
    fn lru_cache_evicts_least_recently_used() {
        let key = |slug: &str| ("walk".to_string(), slug.to_string());
        let empty = || -> TravelData { Arc::new(FxHashMap::default()) };

        let mut cache = LruCache::new(2);
        cache.insert(key("a"), empty());
        cache.insert(key("b"), empty());
        assert!(cache.get(&key("a")).is_some());

        cache.insert(key("c"), empty());
        assert!(cache.get(&key("b")).is_none());
        assert!(cache.get(&key("a")).is_some());
        assert!(cache.get(&key("c")).is_some());
    }
}