Rust things
This commit is contained in:
parent
fc10381692
commit
3debacab4f
30 changed files with 3257 additions and 647 deletions
|
|
@ -97,7 +97,7 @@ fn build_search_text(name: &str, place_type: &str) -> String {
|
|||
}
|
||||
|
||||
if place_type == "station" {
|
||||
let suffix_aliases: [(&str, &[&str]); 5] = [
|
||||
let suffix_aliases: [(&str, &[&str]); 6] = [
|
||||
(
|
||||
" tube station",
|
||||
&[" underground station", " station", " tube", " underground"],
|
||||
|
|
@ -118,6 +118,7 @@ fn build_search_text(name: &str, place_type: &str) -> String {
|
|||
" elizabeth line station",
|
||||
&[" station", " elizabeth line", " crossrail station"],
|
||||
),
|
||||
(" dlr station", &[" station", " dlr"]),
|
||||
];
|
||||
|
||||
for (suffix, replacements) in suffix_aliases {
|
||||
|
|
@ -139,10 +140,15 @@ fn extract_str_col(df: &DataFrame, name: &str) -> anyhow::Result<Vec<String>> {
|
|||
let string_column = column
|
||||
.str()
|
||||
.with_context(|| format!("Column '{name}' is not a string column"))?;
|
||||
Ok(string_column
|
||||
string_column
|
||||
.into_iter()
|
||||
.map(|value| value.unwrap_or("").to_string())
|
||||
.collect())
|
||||
.enumerate()
|
||||
.map(|(row, value)| {
|
||||
value
|
||||
.map(ToString::to_string)
|
||||
.with_context(|| format!("Column '{name}' has null at row {row}"))
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn extract_f32_col(df: &DataFrame, name: &str) -> anyhow::Result<Vec<f32>> {
|
||||
|
|
@ -155,33 +161,37 @@ fn extract_f32_col(df: &DataFrame, name: &str) -> anyhow::Result<Vec<f32>> {
|
|||
let float_column = cast
|
||||
.f32()
|
||||
.with_context(|| format!("Column '{name}' is not a float32 column"))?;
|
||||
Ok(float_column
|
||||
float_column
|
||||
.into_iter()
|
||||
.map(|value| value.unwrap_or(0.0))
|
||||
.collect())
|
||||
.enumerate()
|
||||
.map(|(row, value)| value.with_context(|| format!("Column '{name}' has null at row {row}")))
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn extract_bool_col_or_default(
|
||||
df: &DataFrame,
|
||||
name: &str,
|
||||
default_value: bool,
|
||||
) -> anyhow::Result<Vec<bool>> {
|
||||
let Ok(column) = df.column(name) else {
|
||||
return Ok(vec![default_value; df.height()]);
|
||||
};
|
||||
fn extract_bool_col(df: &DataFrame, name: &str) -> anyhow::Result<Vec<bool>> {
|
||||
let column = df
|
||||
.column(name)
|
||||
.with_context(|| format!("Missing column '{name}' in places data"))?;
|
||||
let bool_column = column
|
||||
.bool()
|
||||
.with_context(|| format!("Column '{name}' is not a boolean column"))?;
|
||||
Ok(bool_column
|
||||
bool_column
|
||||
.into_iter()
|
||||
.map(|value| value.unwrap_or(default_value))
|
||||
.collect())
|
||||
.enumerate()
|
||||
.map(|(row, value)| value.with_context(|| format!("Column '{name}' has null at row {row}")))
|
||||
.collect()
|
||||
}
|
||||
|
||||
impl PlaceData {
|
||||
/// Loads place data from the parquet file at `parquet_path`.
///
/// The actual work is done by `Self::load_inner`; this wrapper only hands
/// that call to `super::run_polars_io` as a closure.
/// NOTE(review): how `run_polars_io` executes the closure is not visible
/// here — confirm against its definition.
pub fn load(parquet_path: &Path) -> anyhow::Result<Self> {
|
||||
super::run_polars_io(|| Self::load_inner(parquet_path))
|
||||
}
|
||||
|
||||
fn load_inner(parquet_path: &Path) -> anyhow::Result<Self> {
|
||||
info!("Loading place data from {:?}...", parquet_path);
|
||||
|
||||
let parquet_path = PlRefPath::try_from_path(parquet_path)
|
||||
.context("Failed to normalize places parquet path")?;
|
||||
let df = LazyFrame::scan_parquet(parquet_path, Default::default())
|
||||
.context("Failed to scan places parquet")?
|
||||
.collect()
|
||||
|
|
@ -210,7 +220,7 @@ impl PlaceData {
|
|||
let type_rank_vec: Vec<u8> = place_type_raw.iter().map(|pt| type_rank(pt)).collect();
|
||||
let place_type = InternedColumn::build(&place_type_raw);
|
||||
let travel_destination = if df.column("travel_destination").is_ok() {
|
||||
extract_bool_col_or_default(&df, "travel_destination", true)?
|
||||
extract_bool_col(&df, "travel_destination")?
|
||||
} else {
|
||||
place_type_raw
|
||||
.iter()
|
||||
|
|
@ -296,6 +306,7 @@ mod tests {
|
|||
assert!(build_search_text("King's Cross tube station", "station")
|
||||
.contains("kings cross underground"));
|
||||
assert!(build_search_text("St Albans", "city").contains("saint albans"));
|
||||
assert!(build_search_text("Shadwell DLR station", "station").contains("shadwell station"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@ use anyhow::{bail, Context};
|
|||
use polars::frame::DataFrame;
|
||||
use polars::lazy::frame::LazyFrame;
|
||||
use polars::prelude::*;
|
||||
use rustc_hash::FxHashSet;
|
||||
use serde::Serialize;
|
||||
use tracing::info;
|
||||
|
||||
|
|
@ -17,6 +18,94 @@ pub struct POICategoryGroup {
|
|||
pub categories: Vec<String>,
|
||||
}
|
||||
|
||||
/// Category names shown under the "Groceries" entry of `DASHBOARD_POI_GROUPS`.
///
/// Order matters: `category_groups` uses each name's position in this list as
/// its preferred sort position within the group.
const GROCERY_DASHBOARD_CATEGORIES: &[&str] = &[
|
||||
"Supermarket",
|
||||
"Convenience Store",
|
||||
"Bakery",
|
||||
"Greengrocer",
|
||||
"Aldi",
|
||||
"Amazon",
|
||||
"Asda",
|
||||
"Booths",
|
||||
"Budgens",
|
||||
"Centra",
|
||||
"Co-op",
|
||||
"COOK",
|
||||
"Costco",
|
||||
"Dunnes Stores",
|
||||
"Farmfoods",
|
||||
"Heron Foods",
|
||||
"Iceland",
|
||||
"Lidl",
|
||||
"Makro",
|
||||
"M&S",
|
||||
"Morrisons",
|
||||
"Planet Organic",
|
||||
"Sainsbury's",
|
||||
"Spar",
|
||||
"Tesco",
|
||||
"The Food Warehouse",
|
||||
"Waitrose",
|
||||
"Whole Foods Market",
|
||||
];
|
||||
|
||||
/// Dashboard POI groups as `(group name, ordered category names)` pairs.
///
/// `category_groups` turns each category's position inside its group into a
/// preferred sort index; categories not listed here are sorted after the
/// listed ones, alphabetically.
const DASHBOARD_POI_GROUPS: &[(&str, &[&str])] = &[
|
||||
(
|
||||
"Public Transport",
|
||||
&[
|
||||
"Rail station",
|
||||
"Tube station",
|
||||
"Bus station",
|
||||
"Bus stop",
|
||||
"Airport",
|
||||
],
|
||||
),
|
||||
// The grocery list is long enough to live in its own constant.
("Groceries", GROCERY_DASHBOARD_CATEGORIES),
|
||||
("Food & Drink", &["Café", "Restaurant", "Pub", "Fast Food"]),
|
||||
("Green Space", &["Park", "Playground"]),
|
||||
("Education", &["School"]),
|
||||
(
|
||||
"Health",
|
||||
&["GP Surgery", "Pharmacy", "Dentist", "Hospital & Clinic"],
|
||||
),
|
||||
(
|
||||
"Leisure",
|
||||
&[
|
||||
"Gym & Fitness",
|
||||
"Sports Centre",
|
||||
"Cinema",
|
||||
"Theatre",
|
||||
"Library",
|
||||
],
|
||||
),
|
||||
(
|
||||
"Practical",
|
||||
&["Post Office", "Bank", "EV Charging", "Fuel Station"],
|
||||
),
|
||||
];
|
||||
|
||||
fn add_category_filter_index(
|
||||
category_values: &[String],
|
||||
category: &str,
|
||||
selected: &mut FxHashSet<u16>,
|
||||
) {
|
||||
if let Some(pos) = category_values.iter().position(|value| value == category) {
|
||||
selected.insert(pos as u16);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn resolve_poi_category_filter(category_values: &[String], categories: &str) -> FxHashSet<u16> {
|
||||
let mut selected = FxHashSet::default();
|
||||
for part in categories.split(',') {
|
||||
let category = part.trim();
|
||||
if category.is_empty() {
|
||||
continue;
|
||||
}
|
||||
add_category_filter_index(category_values, category, &mut selected);
|
||||
}
|
||||
selected
|
||||
}
|
||||
|
||||
pub struct POIData {
|
||||
/// Contiguous buffer holding all POI ID strings end-to-end.
|
||||
id_buffer: String,
|
||||
|
|
@ -53,13 +142,18 @@ fn extract_str_col(df: &DataFrame, name: &str) -> anyhow::Result<Vec<String>> {
|
|||
let string_column = column
|
||||
.str()
|
||||
.with_context(|| format!("Column '{name}' is not a string column"))?;
|
||||
Ok(string_column
|
||||
string_column
|
||||
.into_iter()
|
||||
.map(|value| value.unwrap_or("").to_string())
|
||||
.collect())
|
||||
.enumerate()
|
||||
.map(|(row, value)| {
|
||||
value
|
||||
.map(ToString::to_string)
|
||||
.with_context(|| format!("Column '{name}' has null at row {row}"))
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn extract_f32_col(df: &DataFrame, name: &str, default: f32) -> anyhow::Result<Vec<f32>> {
|
||||
fn extract_f32_col(df: &DataFrame, name: &str) -> anyhow::Result<Vec<f32>> {
|
||||
let column = df
|
||||
.column(name)
|
||||
.with_context(|| format!("Missing column '{name}' in POI data"))?;
|
||||
|
|
@ -69,16 +163,23 @@ fn extract_f32_col(df: &DataFrame, name: &str, default: f32) -> anyhow::Result<V
|
|||
let float_column = cast
|
||||
.f32()
|
||||
.with_context(|| format!("Column '{name}' is not a float32 column"))?;
|
||||
Ok(float_column
|
||||
float_column
|
||||
.into_iter()
|
||||
.map(|value| value.unwrap_or(default))
|
||||
.collect())
|
||||
.enumerate()
|
||||
.map(|(row, value)| value.with_context(|| format!("Column '{name}' has null at row {row}")))
|
||||
.collect()
|
||||
}
|
||||
|
||||
impl POIData {
|
||||
/// Loads POI data from the parquet file at `parquet_path`.
///
/// The actual work is done by `Self::load_inner`; this wrapper only hands
/// that call to `super::run_polars_io` as a closure.
/// NOTE(review): how `run_polars_io` executes the closure is not visible
/// here — confirm against its definition.
pub fn load(parquet_path: &Path) -> anyhow::Result<Self> {
|
||||
super::run_polars_io(|| Self::load_inner(parquet_path))
|
||||
}
|
||||
|
||||
fn load_inner(parquet_path: &Path) -> anyhow::Result<Self> {
|
||||
info!("Loading POI data from {:?}...", parquet_path);
|
||||
|
||||
let parquet_path = PlRefPath::try_from_path(parquet_path)
|
||||
.context("Failed to normalize POI parquet path")?;
|
||||
let df = LazyFrame::scan_parquet(parquet_path, Default::default())
|
||||
.context("Failed to scan POI parquet")?
|
||||
.collect()
|
||||
|
|
@ -91,18 +192,10 @@ impl POIData {
|
|||
let name = extract_str_col(&df, "name")?;
|
||||
let category_raw = extract_str_col(&df, "category")?;
|
||||
let group_raw = extract_str_col(&df, "group")?;
|
||||
let lat = extract_f32_col(&df, "lat", 0.0)?;
|
||||
let lng = extract_f32_col(&df, "lng", 0.0)?;
|
||||
let lat = extract_f32_col(&df, "lat")?;
|
||||
let lng = extract_f32_col(&df, "lng")?;
|
||||
let emoji_raw = extract_str_col(&df, "emoji")?;
|
||||
let icon_category_raw = if df
|
||||
.get_column_names()
|
||||
.iter()
|
||||
.any(|name| name.as_str() == "icon_category")
|
||||
{
|
||||
extract_str_col(&df, "icon_category")?
|
||||
} else {
|
||||
category_raw.clone()
|
||||
};
|
||||
let icon_category_raw = extract_str_col(&df, "icon_category")?;
|
||||
|
||||
// Pack POI IDs into a contiguous buffer
|
||||
let total_id_bytes: usize = id_raw.iter().map(|s| s.len()).sum();
|
||||
|
|
@ -152,7 +245,7 @@ impl POIData {
|
|||
})
|
||||
}
|
||||
|
||||
/// Build category groups from the loaded POI data, validated against POI_GROUP_ORDER.
|
||||
/// Build dashboard category groups from every category present in the loaded POI data.
|
||||
pub fn category_groups(&self) -> anyhow::Result<Vec<POICategoryGroup>> {
|
||||
let mut group_cats: HashMap<String, HashSet<String>> = HashMap::new();
|
||||
let num_pois = self.category.indices.len();
|
||||
|
|
@ -174,18 +267,78 @@ impl POIData {
|
|||
);
|
||||
}
|
||||
|
||||
POI_GROUP_ORDER
|
||||
let preferred_order: HashMap<&str, HashMap<&str, usize>> = DASHBOARD_POI_GROUPS
|
||||
.iter()
|
||||
.map(|group_name| {
|
||||
let name = group_name.to_string();
|
||||
let mut categories: Vec<String> = group_cats
|
||||
.remove(&name)
|
||||
.context("POI group validated but missing from map")?
|
||||
.into_iter()
|
||||
.collect();
|
||||
categories.sort();
|
||||
Ok(POICategoryGroup { name, categories })
|
||||
.map(|(group, categories)| {
|
||||
(
|
||||
*group,
|
||||
categories
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(idx, category)| (*category, idx))
|
||||
.collect(),
|
||||
)
|
||||
})
|
||||
.collect()
|
||||
.collect();
|
||||
|
||||
let groups: Vec<POICategoryGroup> = POI_GROUP_ORDER
|
||||
.iter()
|
||||
.filter_map(|group_name| {
|
||||
let mut categories: Vec<String> = group_cats
|
||||
.get(*group_name)
|
||||
.map(|categories| categories.iter().cloned().collect())
|
||||
.unwrap_or_default();
|
||||
if categories.is_empty() {
|
||||
return None;
|
||||
}
|
||||
let group_order = preferred_order.get(*group_name);
|
||||
categories.sort_by(|a, b| {
|
||||
let a_order = group_order.and_then(|order| order.get(a.as_str())).copied();
|
||||
let b_order = group_order.and_then(|order| order.get(b.as_str())).copied();
|
||||
match (a_order, b_order) {
|
||||
(Some(left), Some(right)) => left.cmp(&right),
|
||||
(Some(_), None) => std::cmp::Ordering::Less,
|
||||
(None, Some(_)) => std::cmp::Ordering::Greater,
|
||||
(None, None) => a.cmp(b),
|
||||
}
|
||||
});
|
||||
Some(POICategoryGroup {
|
||||
name: (*group_name).to_string(),
|
||||
categories,
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
|
||||
Ok(groups)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Names that exist in the value list resolve to exactly their positions.
    #[test]
    fn category_filter_matches_exact_present_categories() {
        let values: Vec<String> = ["Supermarket", "Tesco", "Aldi", "Rail station"]
            .iter()
            .map(|name| name.to_string())
            .collect();

        let selected = resolve_poi_category_filter(&values, "Supermarket,Rail station");

        // Comparing the sorted indices checks membership and size in one go.
        let mut indices: Vec<u16> = selected.into_iter().collect();
        indices.sort_unstable();
        assert_eq!(indices, vec![0, 3]);
    }

    /// A name that is absent from the value list selects nothing.
    #[test]
    fn unknown_category_filter_matches_nothing() {
        let values = vec![String::from("Supermarket")];

        let selected = resolve_poi_category_filter(&values, "Unknown");

        assert_eq!(selected.len(), 0);
    }
}
|
||||
|
|
|
|||
|
|
@ -195,33 +195,38 @@ impl PostcodeData {
|
|||
|
||||
// Extract all outer rings from the geometry
|
||||
let rings: Vec<Vec<[f32; 2]>> = match feature.geometry {
|
||||
Geometry::Polygon { coordinates } => coordinates
|
||||
.first()
|
||||
.map(|ring| {
|
||||
vec![ring
|
||||
.iter()
|
||||
.map(|[lon, lat]| [*lon as f32, *lat as f32])
|
||||
.collect()]
|
||||
})
|
||||
.unwrap_or_default(),
|
||||
Geometry::Polygon { coordinates } => {
|
||||
let ring = coordinates.first().with_context(|| {
|
||||
format!("Postcode '{postcode}' polygon has no outer ring")
|
||||
})?;
|
||||
vec![ring
|
||||
.iter()
|
||||
.map(|[lon, lat]| [*lon as f32, *lat as f32])
|
||||
.collect()]
|
||||
}
|
||||
Geometry::MultiPolygon { coordinates } => coordinates
|
||||
.iter()
|
||||
.filter_map(|poly| {
|
||||
poly.first().map(|ring| {
|
||||
ring.iter()
|
||||
.map(|[lon, lat]| [*lon as f32, *lat as f32])
|
||||
.collect()
|
||||
})
|
||||
.enumerate()
|
||||
.map(|(idx, poly)| {
|
||||
let ring = poly.first().with_context(|| {
|
||||
format!(
|
||||
"Postcode '{postcode}' multipolygon part {idx} has no outer ring"
|
||||
)
|
||||
})?;
|
||||
Ok(ring
|
||||
.iter()
|
||||
.map(|[lon, lat]| [*lon as f32, *lat as f32])
|
||||
.collect())
|
||||
})
|
||||
.collect(),
|
||||
.collect::<anyhow::Result<Vec<_>>>()?,
|
||||
};
|
||||
|
||||
// Compute centroid across all vertices from all rings
|
||||
let total_vertices: usize = rings.iter().map(|ring| ring.len()).sum();
|
||||
let centroid = if total_vertices == 0 {
|
||||
tracing::warn!(postcode = %postcode, "Postcode polygon has zero vertices, defaulting centroid to (0,0)");
|
||||
(0.0, 0.0)
|
||||
} else {
|
||||
if total_vertices == 0 {
|
||||
anyhow::bail!("Postcode '{postcode}' polygon has zero vertices");
|
||||
}
|
||||
let centroid = {
|
||||
let mut sum_lat: f32 = 0.0;
|
||||
let mut sum_lon: f32 = 0.0;
|
||||
for ring in &rings {
|
||||
|
|
|
|||
|
|
@ -14,6 +14,7 @@ const ADDRESS_SEARCH_CANDIDATE_LIMIT: usize = 50_000;
|
|||
const ADDRESS_SEARCH_MAX_POSTINGS_PER_TOKEN: usize = 250_000;
|
||||
const ADDRESS_SEARCH_PREFIX_MIN_LEN: usize = 4;
|
||||
const ADDRESS_SEARCH_PREFIX_MAX_LEN: usize = 8;
|
||||
const NO_POI_METRIC_ROW: u32 = u32::MAX;
|
||||
|
||||
fn is_numeric_dtype(dtype: &DataType) -> bool {
|
||||
matches!(
|
||||
|
|
@ -495,6 +496,187 @@ impl QuantRef<'_> {
|
|||
}
|
||||
}
|
||||
|
||||
/// Quantized POI metrics stored once per postcode row rather than once per
/// property row; property rows reach their metrics through
/// `row_to_metric_idx`.
pub struct PostcodePoiMetrics {
|
||||
/// Metric names; the position of a name is its metric index in every
/// per-metric vector below.
pub feature_names: Vec<String>,
|
||||
/// Maps a metric name to its position in `feature_names`.
pub name_to_index: FxHashMap<String, usize>,
|
||||
/// Metric-major storage: columns[metric_idx][postcode_metric_idx].
/// Values are u16-quantized; non-finite inputs are stored as `NAN_U16`.
pub columns: Vec<Vec<u16>>,
|
||||
/// Per-metric statistics produced by `compute_feature_stats`.
pub feature_stats: Vec<FeatureStats>,
|
||||
/// Per-property row lookup into the postcode metric table.
/// `NO_POI_METRIC_ROW` marks a property that has no metric row.
row_to_metric_idx: Vec<u32>,
|
||||
/// Per-metric dequantization step: `quant_range / QUANT_SCALE`, or 0.0 when
/// the range is not positive.
dequant_a: Vec<f32>,
|
||||
/// Per-metric lower bound subtracted before quantization.
quant_min: Vec<f32>,
|
||||
/// Per-metric `max - min` when `max > min`, otherwise 0.0.
quant_range: Vec<f32>,
|
||||
}
|
||||
|
||||
impl PostcodePoiMetrics {
|
||||
fn empty(row_count: usize) -> Self {
|
||||
Self {
|
||||
feature_names: Vec::new(),
|
||||
name_to_index: FxHashMap::default(),
|
||||
columns: Vec::new(),
|
||||
feature_stats: Vec::new(),
|
||||
row_to_metric_idx: vec![NO_POI_METRIC_ROW; row_count],
|
||||
dequant_a: Vec::new(),
|
||||
quant_min: Vec::new(),
|
||||
quant_range: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
fn from_postcode_df(df: &DataFrame, feature_names: Vec<String>) -> anyhow::Result<Self> {
|
||||
if feature_names.is_empty() {
|
||||
return Ok(Self::empty(0));
|
||||
}
|
||||
|
||||
tracing::info!(
|
||||
metrics = feature_names.len(),
|
||||
postcodes = df.height(),
|
||||
"Building postcode POI metric side table"
|
||||
);
|
||||
|
||||
let col_major: Vec<Vec<f32>> = feature_names
|
||||
.par_iter()
|
||||
.map(|name| {
|
||||
let column = df
|
||||
.column(name.as_str())
|
||||
.with_context(|| format!("Missing POI metric column '{name}'"))?;
|
||||
column_to_f32_vec(column)
|
||||
})
|
||||
.collect::<anyhow::Result<Vec<_>>>()?;
|
||||
|
||||
let feature_stats: Vec<FeatureStats> = col_major
|
||||
.par_iter()
|
||||
.enumerate()
|
||||
.map(|(metric_idx, vals)| {
|
||||
let name = feature_names[metric_idx].as_str();
|
||||
let bounds = features::bounds_for(name)
|
||||
.with_context(|| format!("No bounds config for POI metric '{name}'"))?;
|
||||
Ok(compute_feature_stats(
|
||||
vals,
|
||||
&bounds,
|
||||
features::has_integer_bins(name),
|
||||
))
|
||||
})
|
||||
.collect::<anyhow::Result<Vec<_>>>()?;
|
||||
|
||||
let mut quant_min = Vec::with_capacity(feature_names.len());
|
||||
let mut quant_range = Vec::with_capacity(feature_names.len());
|
||||
for (metric_idx, stats) in feature_stats.iter().enumerate() {
|
||||
let (min, max) = match features::bounds_for(feature_names[metric_idx].as_str()) {
|
||||
Some(Bounds::Fixed { min, max }) => (min, max),
|
||||
_ => (stats.histogram.min, stats.histogram.max),
|
||||
};
|
||||
quant_min.push(min);
|
||||
quant_range.push(if max > min { max - min } else { 0.0 });
|
||||
}
|
||||
let dequant_a: Vec<f32> = quant_range
|
||||
.iter()
|
||||
.map(|&range| {
|
||||
if range > 0.0 {
|
||||
range / QUANT_SCALE
|
||||
} else {
|
||||
0.0
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
let columns: Vec<Vec<u16>> = col_major
|
||||
.par_iter()
|
||||
.enumerate()
|
||||
.map(|(metric_idx, vals)| {
|
||||
let range = quant_range[metric_idx];
|
||||
let min = quant_min[metric_idx];
|
||||
vals.iter()
|
||||
.map(|&value| {
|
||||
if !value.is_finite() {
|
||||
NAN_U16
|
||||
} else if range > 0.0 {
|
||||
let normalized = (value - min) / range;
|
||||
(normalized * QUANT_SCALE).round().clamp(0.0, QUANT_SCALE) as u16
|
||||
} else {
|
||||
0
|
||||
}
|
||||
})
|
||||
.collect()
|
||||
})
|
||||
.collect();
|
||||
|
||||
let name_to_index = feature_names
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(idx, name)| (name.clone(), idx))
|
||||
.collect();
|
||||
|
||||
Ok(Self {
|
||||
feature_names,
|
||||
name_to_index,
|
||||
columns,
|
||||
feature_stats,
|
||||
row_to_metric_idx: Vec::new(),
|
||||
dequant_a,
|
||||
quant_min,
|
||||
quant_range,
|
||||
})
|
||||
}
|
||||
|
||||
fn set_row_mapping(&mut self, row_to_metric_idx: Vec<u32>) {
|
||||
self.row_to_metric_idx = row_to_metric_idx;
|
||||
}
|
||||
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.feature_names.is_empty()
|
||||
}
|
||||
|
||||
pub fn num_features(&self) -> usize {
|
||||
self.feature_names.len()
|
||||
}
|
||||
|
||||
pub fn quant_ref(&self) -> QuantRef<'_> {
|
||||
QuantRef {
|
||||
dequant_a: &self.dequant_a,
|
||||
quant_min: &self.quant_min,
|
||||
quant_range: &self.quant_range,
|
||||
num_numeric: self.feature_names.len(),
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn metric_row_for_property(&self, row: usize) -> Option<usize> {
|
||||
self.row_to_metric_idx
|
||||
.get(row)
|
||||
.copied()
|
||||
.filter(|&idx| idx != NO_POI_METRIC_ROW)
|
||||
.map(|idx| idx as usize)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn raw_for_metric_row(&self, metric_row: usize, metric_idx: usize) -> u16 {
|
||||
self.columns[metric_idx][metric_row]
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn raw_for_property_row(&self, row: usize, metric_idx: usize) -> u16 {
|
||||
let Some(metric_row) = self.metric_row_for_property(row) else {
|
||||
return NAN_U16;
|
||||
};
|
||||
self.raw_for_metric_row(metric_row, metric_idx)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn decode_raw(&self, metric_idx: usize, raw: u16) -> f32 {
|
||||
if raw == NAN_U16 {
|
||||
f32::NAN
|
||||
} else {
|
||||
raw as f32 * self.dequant_a[metric_idx] + self.quant_min[metric_idx]
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn get_for_property_row(&self, row: usize, metric_idx: usize) -> f32 {
|
||||
self.decode_raw(metric_idx, self.raw_for_property_row(row, metric_idx))
|
||||
}
|
||||
}
|
||||
|
||||
pub struct PropertyData {
|
||||
pub lat: Vec<f32>,
|
||||
pub lon: Vec<f32>,
|
||||
|
|
@ -514,6 +696,7 @@ pub struct PropertyData {
|
|||
/// Per-feature: max - min (for encoding filter bounds).
|
||||
quant_range: Vec<f32>,
|
||||
pub feature_stats: Vec<FeatureStats>,
|
||||
pub poi_metrics: PostcodePoiMetrics,
|
||||
/// Unquantized last sale price used by the price-history chart.
|
||||
last_known_price_raw: Vec<f32>,
|
||||
/// Contiguous buffer holding all address strings end-to-end.
|
||||
|
|
@ -1055,19 +1238,54 @@ pub fn precompute_h3(lat: &[f32], lon: &[f32]) -> anyhow::Result<Vec<u64>> {
|
|||
|
||||
impl PropertyData {
|
||||
/// Loads property data from `properties_path` together with the postcode
/// features at `postcode_features_path`.
///
/// The actual work is done by `Self::load_inner`; this wrapper only hands
/// that call to `super::run_polars_io` as a closure.
/// NOTE(review): how `run_polars_io` executes the closure is not visible
/// here — confirm against its definition.
pub fn load(properties_path: &Path, postcode_features_path: &Path) -> anyhow::Result<Self> {
|
||||
super::run_polars_io(|| Self::load_inner(properties_path, postcode_features_path))
|
||||
}
|
||||
|
||||
fn load_inner(properties_path: &Path, postcode_features_path: &Path) -> anyhow::Result<Self> {
|
||||
// Load postcode.parquet
|
||||
tracing::info!(
|
||||
"Loading postcode features from {:?}",
|
||||
postcode_features_path
|
||||
);
|
||||
let postcode_features_path = PlRefPath::try_from_path(postcode_features_path)
|
||||
.context("Failed to normalize postcode parquet path")?;
|
||||
let postcode_df = LazyFrame::scan_parquet(postcode_features_path, Default::default())
|
||||
.context("Failed to scan postcode parquet")?
|
||||
.collect()
|
||||
.context("Failed to read postcode parquet")?;
|
||||
tracing::info!(rows = postcode_df.height(), "Postcode features loaded");
|
||||
|
||||
let mut poi_metric_names: Vec<String> = postcode_df
|
||||
.get_column_names()
|
||||
.iter()
|
||||
.map(|name| name.as_str())
|
||||
.filter(|&name| features::is_dynamic_poi_feature(name))
|
||||
.map(str::to_string)
|
||||
.collect();
|
||||
poi_metric_names.sort_by_key(|name| features::dynamic_poi_feature_sort_key(name));
|
||||
|
||||
let poi_metric_by_postcode: FxHashMap<String, u32> = if poi_metric_names.is_empty() {
|
||||
FxHashMap::default()
|
||||
} else {
|
||||
let postcode_column = postcode_df
|
||||
.column("Postcode")
|
||||
.context("Postcode feature parquet missing 'Postcode' column")?
|
||||
.str()
|
||||
.context("'Postcode' column in postcode feature parquet is not a string")?;
|
||||
postcode_column
|
||||
.into_iter()
|
||||
.enumerate()
|
||||
.filter_map(|(idx, postcode)| {
|
||||
postcode.map(|postcode| (postcode.to_string(), idx as u32))
|
||||
})
|
||||
.collect()
|
||||
};
|
||||
let mut poi_metrics = PostcodePoiMetrics::from_postcode_df(&postcode_df, poi_metric_names)?;
|
||||
|
||||
// Load properties.parquet and join with postcode data for lat/lon + area features
|
||||
tracing::info!("Loading properties from {:?}", properties_path);
|
||||
let properties_path = PlRefPath::try_from_path(properties_path)
|
||||
.context("Failed to normalize properties parquet path")?;
|
||||
let properties_lf = LazyFrame::scan_parquet(properties_path, Default::default())
|
||||
.context("Failed to scan properties parquet")?;
|
||||
let combined = properties_lf
|
||||
|
|
@ -1082,14 +1300,20 @@ impl PropertyData {
|
|||
let total_rows = combined.height();
|
||||
tracing::info!(rows = total_rows, "Properties joined with postcodes");
|
||||
|
||||
// Get configured feature/enum names in config order
|
||||
let numeric_names = features::all_numeric_feature_names();
|
||||
// Get configured feature/enum names in config order. Dynamic POI
|
||||
// metrics live in a postcode-level side table so they do not widen the
|
||||
// hot row-major property feature matrix.
|
||||
let configured_numeric_names = features::all_numeric_feature_names();
|
||||
let enum_names = features::all_enum_feature_names();
|
||||
|
||||
let schema = combined.schema();
|
||||
let numeric_names: Vec<String> = configured_numeric_names
|
||||
.iter()
|
||||
.map(|name| (*name).to_string())
|
||||
.collect();
|
||||
|
||||
for name in &numeric_names {
|
||||
match schema.get(name) {
|
||||
match schema.get(name.as_str()) {
|
||||
Some(dtype) if is_numeric_dtype(dtype) => {}
|
||||
Some(dtype) => bail!(
|
||||
"Configured numeric feature '{}' has non-numeric type {:?}",
|
||||
|
|
@ -1120,8 +1344,8 @@ impl PropertyData {
|
|||
// Combine numeric and enum feature names (numeric first, then enum)
|
||||
let feature_names: Vec<String> = numeric_names
|
||||
.iter()
|
||||
.chain(enum_names.iter())
|
||||
.map(|name| name.to_string())
|
||||
.chain(enum_names.iter().map(|name| name.to_string()))
|
||||
.collect();
|
||||
let num_features = feature_names.len();
|
||||
let num_numeric = numeric_names.len();
|
||||
|
|
@ -1138,16 +1362,16 @@ impl PropertyData {
|
|||
select_exprs.push(col("lon").cast(DataType::Float32));
|
||||
|
||||
// Select numeric features as Float32 (datetime columns → fractional year)
|
||||
for &name in &numeric_names {
|
||||
if is_datetime_dtype(schema.get(name).unwrap()) {
|
||||
for name in &numeric_names {
|
||||
if is_datetime_dtype(schema.get(name.as_str()).unwrap()) {
|
||||
select_exprs.push(
|
||||
(col(name).dt().year().cast(DataType::Float32)
|
||||
+ (col(name).dt().month().cast(DataType::Float32) - lit(1.0f32))
|
||||
(col(name.as_str()).dt().year().cast(DataType::Float32)
|
||||
+ (col(name.as_str()).dt().month().cast(DataType::Float32) - lit(1.0f32))
|
||||
/ lit(12.0f32))
|
||||
.alias(name),
|
||||
.alias(name.as_str()),
|
||||
);
|
||||
} else {
|
||||
select_exprs.push(col(name).cast(DataType::Float32));
|
||||
select_exprs.push(col(name.as_str()).cast(DataType::Float32));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -1233,7 +1457,7 @@ impl PropertyData {
|
|||
.par_iter()
|
||||
.map(|name| {
|
||||
let column = df
|
||||
.column(name)
|
||||
.column(name.as_str())
|
||||
.with_context(|| format!("Missing feature column '{name}'"))?;
|
||||
column_to_f32_vec(column)
|
||||
})
|
||||
|
|
@ -1244,10 +1468,10 @@ impl PropertyData {
|
|||
.par_iter()
|
||||
.enumerate()
|
||||
.map(|(feat_index, vals)| {
|
||||
let name = numeric_names[feat_index];
|
||||
let name = numeric_names[feat_index].as_str();
|
||||
let bounds = features::bounds_for(name)
|
||||
.with_context(|| format!("No bounds config for feature '{}'", name))?;
|
||||
let stats = compute_feature_stats(vals, bounds, features::has_integer_bins(name));
|
||||
let stats = compute_feature_stats(vals, &bounds, features::has_integer_bins(name));
|
||||
tracing::debug!(
|
||||
feature = %name,
|
||||
slider_min = format_args!("{:.2}", stats.slider_min),
|
||||
|
|
@ -1268,8 +1492,8 @@ impl PropertyData {
|
|||
let mut quant_min = Vec::with_capacity(num_features);
|
||||
let mut quant_range = Vec::with_capacity(num_features);
|
||||
for (feat_idx, stats) in numeric_feature_stats.iter().enumerate() {
|
||||
let (min, max) = match features::bounds_for(numeric_names[feat_idx]) {
|
||||
Some(Bounds::Fixed { min, max }) => (*min, *max),
|
||||
let (min, max) = match features::bounds_for(numeric_names[feat_idx].as_str()) {
|
||||
Some(Bounds::Fixed { min, max }) => (min, max),
|
||||
_ => (stats.histogram.min, stats.histogram.max),
|
||||
};
|
||||
quant_min.push(min);
|
||||
|
|
@ -1284,10 +1508,15 @@ impl PropertyData {
|
|||
let string_column = column
|
||||
.str()
|
||||
.with_context(|| format!("Column '{name}' is not a string column"))?;
|
||||
Ok(string_column
|
||||
string_column
|
||||
.into_iter()
|
||||
.map(|value| value.unwrap_or("").to_string())
|
||||
.collect())
|
||||
.enumerate()
|
||||
.map(|(row, value)| {
|
||||
value
|
||||
.map(ToString::to_string)
|
||||
.with_context(|| format!("Required column '{name}' has null at row {row}"))
|
||||
})
|
||||
.collect()
|
||||
};
|
||||
|
||||
let address_raw = extract_string_col(&df, "Address per Property Register")?;
|
||||
|
|
@ -1325,18 +1554,18 @@ impl PropertyData {
|
|||
// enum_col_major: Vec<(values_list, encoded_as_f32)>
|
||||
let enum_col_major: Vec<(Vec<String>, Vec<f32>)> = enum_names
|
||||
.par_iter()
|
||||
.filter_map(|&name| {
|
||||
let column_data = df.column(name).ok()?;
|
||||
let string_column = column_data.str().ok()?;
|
||||
.map(|&name| -> anyhow::Result<(Vec<String>, Vec<f32>)> {
|
||||
let column_data = df
|
||||
.column(name)
|
||||
.with_context(|| format!("Required enum column '{name}' not found"))?;
|
||||
let string_column = column_data
|
||||
.str()
|
||||
.with_context(|| format!("Enum column '{name}' is not a string column"))?;
|
||||
let unique_set: std::collections::HashSet<String> = string_column
|
||||
.into_iter()
|
||||
.filter_map(|value| {
|
||||
let text = value.unwrap_or("");
|
||||
if text.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(text.to_string())
|
||||
}
|
||||
let text = value?.trim();
|
||||
(!text.is_empty()).then(|| text.to_string())
|
||||
})
|
||||
.collect();
|
||||
|
||||
|
|
@ -1373,20 +1602,22 @@ impl PropertyData {
|
|||
|
||||
let encoded: Vec<f32> = string_column
|
||||
.into_iter()
|
||||
.map(|value| {
|
||||
let text = value.unwrap_or("");
|
||||
if text.is_empty() {
|
||||
f32::NAN
|
||||
} else {
|
||||
*value_to_idx.get(text).unwrap_or(&f32::NAN)
|
||||
}
|
||||
.enumerate()
|
||||
.map(|(row, value)| {
|
||||
let Some(text) = value.map(str::trim).filter(|text| !text.is_empty())
|
||||
else {
|
||||
return Ok(f32::NAN);
|
||||
};
|
||||
value_to_idx.get(text).copied().with_context(|| {
|
||||
format!("Enum column '{name}' has unknown value '{text}' at row {row}")
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
.collect::<anyhow::Result<Vec<_>>>()?;
|
||||
|
||||
tracing::debug!(column = %name, unique_values = unique.len(), "Enum feature encoded as f32");
|
||||
Some((unique, encoded))
|
||||
Ok((unique, encoded))
|
||||
})
|
||||
.collect();
|
||||
.collect::<anyhow::Result<Vec<_>>>()?;
|
||||
|
||||
// Extract is_approx_build_date: 0.0 = exact, anything else (1.0/NaN) = approximate
|
||||
let is_approx_build_date_raw: Vec<bool> = if has_approx_col {
|
||||
|
|
@ -1487,13 +1718,13 @@ impl PropertyData {
|
|||
.collect();
|
||||
let last_known_price_raw: Vec<f32> = numeric_names
|
||||
.iter()
|
||||
.position(|&name| name == "Last known price")
|
||||
.position(|name| name == "Last known price")
|
||||
.map(|price_idx| {
|
||||
perm.iter()
|
||||
.map(|&perm_index| numeric_col_major[price_idx][perm_index as usize])
|
||||
.collect()
|
||||
})
|
||||
.unwrap_or_else(|| vec![f32::NAN; row_count]);
|
||||
.context("Required numeric column 'Last known price' not configured")?;
|
||||
|
||||
// Build contiguous address buffer and address search index (permuted)
|
||||
tracing::info!("Building interned strings");
|
||||
|
|
@ -1561,6 +1792,20 @@ impl PropertyData {
|
|||
}
|
||||
let postcode_interner = postcode_rodeo.into_reader();
|
||||
|
||||
let row_to_poi_metric_idx: Vec<u32> = if poi_metrics.is_empty() {
|
||||
vec![NO_POI_METRIC_ROW; row_count]
|
||||
} else {
|
||||
perm.iter()
|
||||
.map(|&old_row| {
|
||||
poi_metric_by_postcode
|
||||
.get(postcode_raw[old_row as usize].as_str())
|
||||
.copied()
|
||||
.unwrap_or(NO_POI_METRIC_ROW)
|
||||
})
|
||||
.collect()
|
||||
};
|
||||
poi_metrics.set_row_mapping(row_to_poi_metric_idx);
|
||||
|
||||
// Pack is_approx_build_date into a bitvec (8 bools per byte)
|
||||
let num_bytes = row_count.div_ceil(8);
|
||||
let mut approx_build_date_bits = vec![0u8; num_bytes];
|
||||
|
|
@ -1697,6 +1942,7 @@ impl PropertyData {
|
|||
quant_min,
|
||||
quant_range,
|
||||
feature_stats,
|
||||
poi_metrics,
|
||||
last_known_price_raw,
|
||||
address_buffer,
|
||||
address_offsets,
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@ use std::sync::Arc;
|
|||
use anyhow::Context;
|
||||
use parking_lot::Mutex;
|
||||
use polars::lazy::frame::LazyFrame;
|
||||
use polars::prelude::PlRefPath;
|
||||
use rustc_hash::{FxHashMap, FxHashSet};
|
||||
use tracing::info;
|
||||
|
||||
|
|
@ -155,15 +156,23 @@ impl TravelTimeStore {
|
|||
/// Returns a cached or freshly-loaded postcode → travel_minutes mapping.
|
||||
pub fn get(&self, mode: &str, slug: &str) -> anyhow::Result<TravelData> {
|
||||
let key = (mode.to_string(), slug.to_string());
|
||||
|
||||
// Check cache first
|
||||
{
|
||||
let mut cache = self.cache.lock();
|
||||
if let Some(data) = cache.get(&key) {
|
||||
return Ok(data);
|
||||
}
|
||||
if let Some(data) = self.get_cached(&key) {
|
||||
return Ok(data);
|
||||
}
|
||||
|
||||
super::run_polars_io(|| self.load_uncached(key))
|
||||
}
|
||||
|
||||
/// Looks `key` up in the cache while holding the cache lock.
///
/// Returns `None` when the cache has no entry for `key`. The lock is
/// released before the caller sees the result.
fn get_cached(&self, key: &(String, String)) -> Option<TravelData> {
    self.cache.lock().get(key)
}
|
||||
|
||||
fn load_uncached(&self, key: (String, String)) -> anyhow::Result<TravelData> {
|
||||
if let Some(data) = self.get_cached(&key) {
|
||||
return Ok(data);
|
||||
}
|
||||
let (mode, slug) = &key;
|
||||
// Resolve slug to actual filename (may have numeric prefix).
|
||||
// Reject unknown slugs rather than falling back to raw input to prevent path traversal.
|
||||
let file_stem = self
|
||||
|
|
@ -175,7 +184,9 @@ impl TravelTimeStore {
|
|||
.join(mode)
|
||||
.join(format!("{}.parquet", file_stem));
|
||||
|
||||
let df = LazyFrame::scan_parquet(&path, Default::default())
|
||||
let parquet_path = PlRefPath::try_from_path(&path)
|
||||
.with_context(|| format!("Failed to normalize path: {}", path.display()))?;
|
||||
let df = LazyFrame::scan_parquet(parquet_path, Default::default())
|
||||
.with_context(|| format!("Failed to scan: {}", path.display()))?
|
||||
.collect()
|
||||
.with_context(|| format!("Failed to read: {}", path.display()))?;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue