This commit is contained in:
Andras Schmelczer 2026-05-26 19:45:13 +01:00
parent c645b0f1d4
commit 39ef5c6646
79 changed files with 5660 additions and 2199 deletions

View file

@ -61,6 +61,9 @@ pub struct ActualListingData {
/// overlaid where available. This lets the listings endpoint use the same filter
/// execution path as the property endpoints.
pub filter_feature_data: Vec<u16>,
/// Row-major dynamic postcode POI metrics aligned with
/// PropertyData::poi_metrics.feature_names.
pub poi_filter_feature_data: Vec<u16>,
pub grid: GridIndex,
}
@ -109,16 +112,16 @@ impl ActualListingData {
let listing_status = InternedColumn::build(&opt_to_string(&listing_status_raw));
let filter_feature_data = build_filter_feature_data(
&df,
property_data,
&postcode,
&address,
&property_type_raw,
&leasehold_freehold_raw,
&rooms_total,
&floor_area_sqm,
&asking_price,
&asking_price_per_sqm,
);
)?;
let poi_filter_feature_data = build_poi_filter_feature_data(&df, property_data)?;
let grid = GridIndex::build(&lat, &lon, GRID_CELL_SIZE);
@ -144,6 +147,7 @@ impl ActualListingData {
listing_date_iso,
features,
filter_feature_data,
poi_filter_feature_data,
grid,
})
}
@ -174,49 +178,37 @@ impl ActualListingData {
#[allow(clippy::too_many_arguments)]
fn build_filter_feature_data(
df: &DataFrame,
property_data: Option<&PropertyData>,
postcode: &[String],
address: &[Option<String>],
property_type: &[Option<String>],
leasehold_freehold: &[Option<String>],
rooms_total: &[Option<i32>],
floor_area_sqm: &[Option<f32>],
asking_price: &[Option<i64>],
asking_price_per_sqm: &[Option<f32>],
) -> Vec<u16> {
) -> Result<Vec<u16>> {
let Some(property_data) = property_data else {
return Vec::new();
return Ok(Vec::new());
};
let num_features = property_data.num_features;
let mut feature_data = vec![NAN_U16; postcode.len() * num_features];
let mut joined_rows = 0usize;
let row_count = df.height();
let mut feature_data = vec![NAN_U16; row_count * num_features];
let quant = property_data.quant_ref();
let mut encoded_columns = 0usize;
for (row, postcode_value) in postcode.iter().enumerate() {
let Some(address_value) = address[row]
.as_deref()
.map(str::trim)
.filter(|v| !v.is_empty())
else {
continue;
};
let query = format!("{address_value} {postcode_value}");
let Some(&property_row) = property_data.search_addresses(&query, 1).first() else {
continue;
};
if property_data.postcode(property_row) != postcode_value {
continue;
for (feat_idx, name) in property_data.feature_names.iter().enumerate() {
if feat_idx < property_data.num_numeric {
if let Some(values) = extract_optional_feature_f32(df, name)? {
encode_numeric_feature(&mut feature_data, property_data, &quant, feat_idx, values);
encoded_columns += 1;
}
} else if let Some(values) = extract_optional_feature_str(df, name)? {
encode_enum_feature(&mut feature_data, property_data, feat_idx, values);
encoded_columns += 1;
}
let dst = row * num_features;
let src = property_row * num_features;
feature_data[dst..dst + num_features]
.copy_from_slice(&property_data.feature_data[src..src + num_features]);
joined_rows += 1;
}
let quant = property_data.quant_ref();
overlay_numeric_feature(
&mut feature_data,
property_data,
@ -281,11 +273,50 @@ fn build_filter_feature_data(
);
info!(
rows = postcode.len(),
joined_rows, "Actual listings joined to property feature matrix"
rows = row_count,
encoded_columns, "Actual listings feature matrix read from enriched parquet"
);
feature_data
Ok(feature_data)
}
/// Builds the row-major matrix of quantised POI metrics for the listings in `df`.
///
/// The matrix has `df.height()` rows and one column per entry of
/// `property_data.poi_metrics.feature_names`. Metrics whose column is absent
/// from `df`, and null cells, stay `NAN_U16`.
///
/// Returns an empty vector when no property data is loaded or when the POI
/// metric set is empty.
///
/// # Errors
///
/// Propagates any failure from reading a metric column out of `df`.
fn build_poi_filter_feature_data(
    df: &DataFrame,
    property_data: Option<&PropertyData>,
) -> Result<Vec<u16>> {
    let poi_metrics = match property_data {
        Some(data) => &data.poi_metrics,
        None => return Ok(Vec::new()),
    };
    let width = poi_metrics.num_features();
    if width == 0 {
        return Ok(Vec::new());
    }

    let row_count = df.height();
    let mut matrix = vec![NAN_U16; row_count * width];
    let quant = poi_metrics.quant_ref();
    let mut encoded_columns = 0usize;

    for (metric_idx, name) in poi_metrics.feature_names.iter().enumerate() {
        // A metric that the parquet does not carry simply keeps its NaN column.
        if let Some(column_values) = extract_optional_feature_f32(df, name)? {
            for (row, cell) in column_values.into_iter().enumerate() {
                let encoded = match cell {
                    Some(value) => encode_numeric_value(&quant, metric_idx, value),
                    None => NAN_U16,
                };
                matrix[row * width + metric_idx] = encoded;
            }
            encoded_columns += 1;
        }
    }

    info!(
        rows = row_count,
        encoded_columns, "Actual listings POI metrics read from enriched parquet"
    );
    Ok(matrix)
}
fn feature_index(property_data: &PropertyData, name: &str) -> Option<usize> {
@ -323,6 +354,53 @@ fn overlay_numeric_feature<I>(
}
}
/// Writes one quantised numeric feature column into the row-major `feature_data`.
///
/// `values` is consumed in row order; the n-th item is stored at
/// `n * property_data.num_features + feat_idx`. `None` is stored as `NAN_U16`.
///
/// # Panics
///
/// Panics if `values` yields more rows than `feature_data` has room for.
fn encode_numeric_feature<I>(
    feature_data: &mut [u16],
    property_data: &PropertyData,
    quant: &QuantRef<'_>,
    feat_idx: usize,
    values: I,
) where
    I: IntoIterator<Item = Option<f32>>,
{
    let stride = property_data.num_features;
    for (row, cell) in values.into_iter().enumerate() {
        let encoded = match cell {
            Some(value) => encode_numeric_value(quant, feat_idx, value),
            None => NAN_U16,
        };
        feature_data[row * stride + feat_idx] = encoded;
    }
}
/// Reads column `name` from `df` as optional `f32` values, if the column exists.
///
/// * Missing column: `Ok(None)`.
/// * `Date` / `Datetime` column: each value becomes a fractional year,
///   `year + (month - 1) / 12`.
/// * Any other dtype: cast to `Float32`, with non-finite values mapped to `None`.
///
/// # Errors
///
/// Returns an error when the datetime projection, the cast to `Float32`, or
/// the `f32` view of the cast column fails.
fn extract_optional_feature_f32(df: &DataFrame, name: &str) -> Result<Option<Vec<Option<f32>>>> {
    let column = match df.column(name) {
        Ok(column) => column,
        Err(_) => return Ok(None),
    };

    let is_temporal = matches!(column.dtype(), DataType::Datetime(_, _) | DataType::Date);
    if is_temporal {
        let year_part = col(name).dt().year().cast(DataType::Float32);
        let month_part =
            (col(name).dt().month().cast(DataType::Float32) - lit(1.0f32)) / lit(12.0f32);
        let projected = df
            .clone()
            .lazy()
            .select([(year_part + month_part).alias("__feature")])
            .collect()
            .with_context(|| format!("Failed to convert datetime feature '{name}'"))?;
        let fractional_years = extract_opt_f32(&projected, "__feature")?;
        return Ok(Some(fractional_years));
    }

    let as_f32 = column
        .cast(&DataType::Float32)
        .with_context(|| format!("Failed to cast feature '{name}' to Float32"))?;
    let chunked = as_f32
        .f32()
        .with_context(|| format!("Feature '{name}' is not Float32"))?;
    // NaN / infinity are treated the same as a missing value.
    let finite_only: Vec<Option<f32>> = chunked
        .into_iter()
        .map(|cell| match cell {
            Some(number) if number.is_finite() => Some(number),
            _ => None,
        })
        .collect();
    Ok(Some(finite_only))
}
fn overlay_enum_feature<'a, I>(
feature_data: &mut [u16],
property_data: &PropertyData,
@ -355,6 +433,46 @@ fn overlay_enum_feature<'a, I>(
}
}
/// Writes one enum feature column into the row-major `feature_data`.
///
/// Each value is trimmed and looked up in the enum dictionary registered for
/// `feat_idx` in `property_data.enum_values`; the stored code is the position
/// of the first matching dictionary entry. Missing, blank, or unrecognised
/// values are stored as `NAN_U16`.
///
/// If no dictionary is registered for `feat_idx` the function returns without
/// touching `feature_data`.
///
/// # Panics
///
/// Panics if `values` has more rows than `feature_data` has room for.
fn encode_enum_feature(
    feature_data: &mut [u16],
    property_data: &PropertyData,
    feat_idx: usize,
    values: Vec<Option<String>>,
) {
    let Some(enum_values) = property_data.enum_values.get(&feat_idx) else {
        return;
    };
    let num_features = property_data.num_features;
    for (row, value) in values.into_iter().enumerate() {
        let code = value
            .as_deref()
            .map(str::trim)
            .filter(|text| !text.is_empty())
            .and_then(|text| enum_values.iter().position(|candidate| candidate == text))
            // A checked conversion: a position that does not fit in u16 must
            // become "unknown" instead of wrapping onto an unrelated code.
            .and_then(|position| u16::try_from(position).ok())
            .unwrap_or(NAN_U16);
        feature_data[row * num_features + feat_idx] = code;
    }
}
/// Reads column `name` from `df` as optional strings, if the column exists.
///
/// Returns `Ok(None)` when the column is missing. Otherwise the column is cast
/// to `String`; null cells and cells that are empty after trimming become
/// `None`. Non-blank cells are returned untrimmed.
///
/// # Errors
///
/// Returns an error when the cast to `String` or the string view of the cast
/// column fails.
fn extract_optional_feature_str(df: &DataFrame, name: &str) -> Result<Option<Vec<Option<String>>>> {
    let column = match df.column(name) {
        Ok(column) => column,
        Err(_) => return Ok(None),
    };
    let as_text = column
        .cast(&DataType::String)
        .with_context(|| format!("Failed to cast feature '{name}' to String"))?;
    let strings = as_text
        .str()
        .with_context(|| format!("Feature '{name}' is not a string column"))?;

    let cells: Vec<Option<String>> = strings
        .into_iter()
        .map(|cell| match cell {
            Some(text) if !text.trim().is_empty() => Some(text.to_string()),
            _ => None,
        })
        .collect();
    Ok(Some(cells))
}
fn encode_numeric_value(quant: &QuantRef<'_>, feat_idx: usize, value: f32) -> u16 {
if !value.is_finite() {
return NAN_U16;
@ -517,8 +635,13 @@ mod tests {
use std::path::PathBuf;
fn sample_path() -> Option<PathBuf> {
let path = PathBuf::from("../finder/data/online_listings_buy.parquet");
path.exists().then_some(path)
[
"../finder/data/online_listings_buy_enriched.parquet",
"../finder/data/online_listings_buy.parquet",
]
.into_iter()
.map(PathBuf::from)
.find(|path| path.exists())
}
#[test]

View file

@ -63,7 +63,20 @@ const DASHBOARD_POI_GROUPS: &[(&str, &[&str])] = &[
("Groceries", GROCERY_DASHBOARD_CATEGORIES),
("Food & Drink", &["Café", "Restaurant", "Pub", "Fast Food"]),
("Green Space", &["Park", "Playground"]),
("Education", &["School"]),
(
"Education",
&[
"Nursery school",
"Primary school",
"Secondary school",
"All-through school",
"Sixth form",
"Further education college",
"University",
"Special school",
"School",
],
),
(
"Health",
&["GP Surgery", "Pharmacy", "Dentist", "Hospital & Clinic"],
@ -119,6 +132,21 @@ fn canonical_poi_category(category: &str) -> &str {
}
}
/// Categories the pipeline emits for the GIAS-derived school POIs.
///
/// A bare `poi=School` URL (predating the per-phase split) is expanded to all
/// of these so bookmarked links keep showing schools. The plain `"School"`
/// entry is part of the expansion, so a category with that exact name is still
/// matched as well.
///
/// The entries are the same as the "Education" group of
/// `DASHBOARD_POI_GROUPS`; keep the two lists in sync when adding a category.
const SCHOOL_CATEGORY_ALIASES: &[&str] = &[
"Nursery school",
"Primary school",
"Secondary school",
"All-through school",
"Sixth form",
"Further education college",
"University",
"Special school",
"School",
];
pub fn resolve_poi_category_filter(category_values: &[String], categories: &str) -> FxHashSet<u16> {
let mut selected = FxHashSet::default();
for part in categories.split(',') {
@ -126,6 +154,12 @@ pub fn resolve_poi_category_filter(category_values: &[String], categories: &str)
if category.is_empty() {
continue;
}
if category == "School" {
for alias in SCHOOL_CATEGORY_ALIASES {
add_category_filter_index(category_values, alias, &mut selected);
}
continue;
}
add_category_filter_index(category_values, category, &mut selected);
}
selected
@ -174,6 +208,8 @@ pub struct SchoolMetadata {
pub telephone: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
pub head_name: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
pub ofsted_rating: Option<String>,
}
pub struct POIData {
@ -350,6 +386,8 @@ fn build_school_meta(
let website = extract_optional_str_col(df, "school_website")?.unwrap_or_default();
let telephone = extract_optional_str_col(df, "school_telephone")?.unwrap_or_default();
let head_name = extract_optional_str_col(df, "school_head_name")?.unwrap_or_default();
let ofsted_rating =
extract_optional_str_col(df, "school_ofsted_rating")?.unwrap_or_default();
let fetch_str = |col: &Vec<Option<String>>, row: usize| -> Option<String> {
col.get(row).cloned().flatten()
@ -390,6 +428,7 @@ fn build_school_meta(
website: fetch_str(&website, row),
telephone: fetch_str(&telephone, row),
head_name: fetch_str(&head_name, row),
ofsted_rating: fetch_str(&ofsted_rating, row),
});
}
Ok((idx, meta))
@ -578,6 +617,26 @@ mod tests {
assert!(selected.is_empty());
}
#[test]
fn legacy_school_filter_expands_to_all_school_categories() {
    // Links bookmarked before the per-phase split still carry `poi=School`;
    // that value has to select every loaded school category and nothing else.
    let values: Vec<String> = ["Primary school", "Secondary school", "University", "Tesco"]
        .iter()
        .map(|name| name.to_string())
        .collect();

    let selected = resolve_poi_category_filter(&values, "School");

    for school_idx in 0u16..3 {
        assert!(selected.contains(&school_idx));
    }
    assert!(!selected.contains(&3));
    assert_eq!(selected.len(), 3);
}
#[test]
fn coop_category_aliases_resolve_to_single_category() {
let values = vec!["Co-op".to_string(), "Tesco".to_string()];

View file

@ -891,6 +891,15 @@ impl PropertyData {
(&self.postcode_interner, &self.postcode_keys)
}
/// Returns the property row indices recorded for `postcode`.
///
/// The slice is empty when the postcode string was never interned or when no
/// rows are indexed under its key.
pub fn rows_for_postcode(&self, postcode: &str) -> &[u32] {
    let Some(key) = self.postcode_interner.get(postcode) else {
        return &[];
    };
    match self.postcode_row_index.get(&key) {
        Some(rows) => rows.as_slice(),
        None => &[],
    }
}
fn row_address_search_tokens(&self, row: usize) -> &[lasso::Spur] {
let offset = self.address_search_token_offsets[row] as usize;
let length = self.address_search_token_lengths[row] as usize;

View file

@ -426,21 +426,6 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
raw: false,
absolute: false,
}),
Feature::Numeric(FeatureConfig {
name: "Education, Skills and Training Score",
bounds: Bounds::Fixed {
min: 0.0,
max: 100.0,
},
step: 1.0,
description: "Education and skills deprivation percentile (higher = less deprived)",
detail: "From the English Indices of Deprivation, converted to a national percentile where 0% is most deprived and 100% is least deprived. Covers school attainment, entry to higher education, adult qualifications, and English language proficiency.",
source: "iod",
prefix: "",
suffix: "%",
raw: true,
absolute: true,
}),
],
},
FeatureGroup {
@ -476,6 +461,21 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
raw: true,
absolute: true,
}),
Feature::Numeric(FeatureConfig {
name: "Education, Skills and Training Score",
bounds: Bounds::Fixed {
min: 0.0,
max: 100.0,
},
step: 1.0,
description: "Education and skills deprivation percentile (higher = less deprived)",
detail: "From the English Indices of Deprivation, converted to a national percentile where 0% is most deprived and 100% is least deprived. Covers school attainment, entry to higher education, adult qualifications, and English language proficiency.",
source: "iod",
prefix: "",
suffix: "%",
raw: true,
absolute: true,
}),
Feature::Numeric(FeatureConfig {
name: "Health Deprivation and Disability Score",
bounds: Bounds::Fixed {

View file

@ -13,12 +13,11 @@ use crate::consts::NAN_U16;
use crate::data::ActualListing;
use crate::licensing::{check_license_bounds, resolve_share_code};
use crate::parsing::{
parse_filters_with_poi, require_bounds, row_passes_filters, row_passes_poi_filters,
ParsedEnumFilter, ParsedFilter,
parse_filters_with_poi, require_bounds, ParsedEnumFilter, ParsedFilter, ParsedPoiFilter,
};
use crate::state::{AppState, SharedState};
use super::travel_time::{parse_optional_travel, row_passes_travel_filters, TravelEntry};
use super::travel_time::{load_travel_data, parse_optional_travel, row_passes_travel_filters};
#[derive(Deserialize)]
pub struct ActualListingsParams {
@ -41,17 +40,6 @@ pub struct ActualListingsResponse {
pub truncated: bool,
}
const LISTING_LEVEL_FILTER_FEATURES: &[&str] = &[
"Property type",
"Leasehold/Freehold",
"Total floor area (sqm)",
"Number of bedrooms & living rooms",
"Estimated current price",
"Last known price",
"Est. price per sqm",
"Price per sqm",
];
const KEEP_UNKNOWN_LISTING_FILTER_FEATURES: &[&str] = &["Total floor area (sqm)"];
pub async fn get_actual_listings(
@ -90,38 +78,23 @@ pub async fn get_actual_listings(
let travel_entries = parse_optional_travel(params.travel.as_deref())
.map_err(|err| ApiError::BadRequest(err).into_response())?;
let listing_level_feature_idxs = listing_level_filter_feature_idxs(&state);
let keep_unknown_listing_filter_idxs = keep_unknown_listing_filter_feature_idxs(&state);
let (listing_filters, postcode_filters) =
split_numeric_filters(parsed_filters, &listing_level_feature_idxs);
let (listing_enum_filters, postcode_enum_filters) =
split_enum_filters(parsed_enum_filters, &listing_level_feature_idxs);
let listing_filters = parsed_filters;
let listing_enum_filters = parsed_enum_filters;
let has_postcode_filters = !postcode_filters.is_empty()
|| !postcode_enum_filters.is_empty()
|| !parsed_poi_filters.is_empty()
|| !travel_entries.is_empty();
let has_listing_filters = !listing_filters.is_empty() || !listing_enum_filters.is_empty();
let state_clone = state.clone();
let response =
tokio::task::spawn_blocking(move || -> Result<ActualListingsResponse, String> {
let t0 = std::time::Instant::now();
let passing_postcodes = if has_postcode_filters {
Some(compute_passing_postcodes(
&state_clone,
south,
west,
north,
east,
&postcode_filters,
&postcode_enum_filters,
&parsed_poi_filters,
&travel_entries,
)?)
let has_poi_filters = !parsed_poi_filters.is_empty();
let has_travel_filters = !travel_entries.is_empty();
let poi_num_features = state_clone.data.poi_metrics.num_features();
let travel_data = if has_travel_filters {
load_travel_data(&state_clone.travel_time_store, &travel_entries)?
} else {
None
Vec::new()
};
let row_indices = actual_listings.grid.query(south, west, north, east);
@ -133,11 +106,6 @@ pub async fn get_actual_listings(
.iter()
.filter_map(|&row_idx| {
let row = row_idx as usize;
if let Some(allowed) = passing_postcodes.as_ref() {
if !allowed.contains(actual_listings.postcode[row].as_str()) {
return None;
}
}
if has_listing_filters
&& !row_passes_listing_filters(
row,
@ -150,6 +118,25 @@ pub async fn get_actual_listings(
{
return None;
}
if has_poi_filters
&& !row_passes_listing_poi_filters(
row,
&parsed_poi_filters,
&actual_listings.poi_filter_feature_data,
poi_num_features,
)
{
return None;
}
if has_travel_filters
&& !row_passes_travel_filters(
actual_listings.postcode[row].as_str(),
&travel_entries,
&travel_data,
)
{
return None;
}
Some(row)
})
.collect();
@ -176,8 +163,9 @@ pub async fn get_actual_listings(
total = total_matching,
total_in_bounds,
offset,
postcode_filtered = passing_postcodes.is_some(),
listing_filtered = has_listing_filters,
poi_filtered = has_poi_filters,
travel_filtered = has_travel_filters,
ms = format_args!("{:.1}", elapsed.as_secs_f64() * 1000.0),
"GET /api/actual-listings"
);
@ -196,10 +184,6 @@ pub async fn get_actual_listings(
Ok(Json(response))
}
fn listing_level_filter_feature_idxs(state: &AppState) -> FxHashSet<usize> {
feature_idxs(state, LISTING_LEVEL_FILTER_FEATURES)
}
fn keep_unknown_listing_filter_feature_idxs(state: &AppState) -> FxHashSet<usize> {
feature_idxs(state, KEEP_UNKNOWN_LISTING_FILTER_FEATURES)
}
@ -211,38 +195,6 @@ fn feature_idxs(state: &AppState, names: &[&str]) -> FxHashSet<usize> {
.collect()
}
fn split_numeric_filters(
filters: Vec<ParsedFilter>,
listing_level_feature_idxs: &FxHashSet<usize>,
) -> (Vec<ParsedFilter>, Vec<ParsedFilter>) {
let mut listing_filters = Vec::new();
let mut postcode_filters = Vec::new();
for filter in filters {
if listing_level_feature_idxs.contains(&filter.feat_idx) {
listing_filters.push(filter);
} else {
postcode_filters.push(filter);
}
}
(listing_filters, postcode_filters)
}
fn split_enum_filters(
filters: Vec<ParsedEnumFilter>,
listing_level_feature_idxs: &FxHashSet<usize>,
) -> (Vec<ParsedEnumFilter>, Vec<ParsedEnumFilter>) {
let mut listing_filters = Vec::new();
let mut postcode_filters = Vec::new();
for filter in filters {
if listing_level_feature_idxs.contains(&filter.feat_idx) {
listing_filters.push(filter);
} else {
postcode_filters.push(filter);
}
}
(listing_filters, postcode_filters)
}
fn row_passes_listing_filters(
row: usize,
filters: &[ParsedFilter],
@ -266,132 +218,33 @@ fn row_passes_listing_filters(
})
}
#[allow(clippy::too_many_arguments)]
fn compute_passing_postcodes(
state: &AppState,
south: f64,
west: f64,
north: f64,
east: f64,
parsed_filters: &[crate::parsing::ParsedFilter],
parsed_enum_filters: &[crate::parsing::ParsedEnumFilter],
parsed_poi_filters: &[crate::parsing::ParsedPoiFilter],
travel_entries: &[TravelEntry],
) -> Result<FxHashSet<String>, String> {
let num_features = state.data.num_features;
let feature_data = &state.data.feature_data;
let poi_metrics = &state.data.poi_metrics;
let has_poi_filters = !parsed_poi_filters.is_empty();
fn row_passes_listing_poi_filters(
row: usize,
filters: &[ParsedPoiFilter],
feature_data: &[u16],
num_features: usize,
) -> bool {
if filters.is_empty() {
return true;
}
if num_features == 0 || feature_data.is_empty() {
return false;
}
let travel_data = if travel_entries.is_empty() {
Vec::new()
} else {
let store = &state.travel_time_store;
travel_entries
.iter()
.map(|entry| {
store
.get(&entry.mode, &entry.slug)
.map_err(|err| format!("Failed to load travel data: {}", err))
})
.collect::<Result<Vec<_>, _>>()?
};
let has_travel = !travel_entries.is_empty();
let mut passing: FxHashSet<String> = FxHashSet::default();
state
.grid
.for_each_in_bounds(south, west, north, east, |row_idx| {
let row = row_idx as usize;
if !row_passes_filters(
row,
parsed_filters,
parsed_enum_filters,
feature_data,
num_features,
) {
return;
}
if has_poi_filters && !row_passes_poi_filters(row, parsed_poi_filters, poi_metrics) {
return;
}
let postcode = state.data.postcode(row);
if has_travel && !row_passes_travel_filters(postcode, travel_entries, &travel_data) {
return;
}
// Property postcodes share the same canonical "OUT IN" format used by
// ActualListingData::load (normalize_postcode), so we can match by string.
if !passing.contains(postcode) {
passing.insert(postcode.to_string());
}
});
Ok(passing)
let base = row * num_features;
filters.iter().all(|filter| {
let raw = feature_data
.get(base + filter.metric_idx)
.copied()
.unwrap_or(NAN_U16);
raw != NAN_U16 && raw >= filter.min_u16 && raw <= filter.max_u16
})
}
#[cfg(test)]
mod tests {
use super::*;
fn numeric_filter(feat_idx: usize) -> ParsedFilter {
ParsedFilter {
feat_idx,
min_u16: 0,
max_u16: 100,
}
}
fn enum_filter(feat_idx: usize) -> ParsedEnumFilter {
ParsedEnumFilter {
feat_idx,
allowed: [0u16].into_iter().collect(),
}
}
#[test]
fn splits_actual_listing_filters_by_listing_native_features() {
let listing_level_feature_idxs: FxHashSet<usize> = [1usize, 3].into_iter().collect();
let (listing_filters, postcode_filters) = split_numeric_filters(
vec![numeric_filter(0), numeric_filter(1), numeric_filter(3)],
&listing_level_feature_idxs,
);
assert_eq!(
listing_filters
.iter()
.map(|filter| filter.feat_idx)
.collect::<Vec<_>>(),
vec![1, 3]
);
assert_eq!(
postcode_filters
.iter()
.map(|filter| filter.feat_idx)
.collect::<Vec<_>>(),
vec![0]
);
let (listing_enum_filters, postcode_enum_filters) = split_enum_filters(
vec![enum_filter(2), enum_filter(3)],
&listing_level_feature_idxs,
);
assert_eq!(
listing_enum_filters
.iter()
.map(|filter| filter.feat_idx)
.collect::<Vec<_>>(),
vec![3]
);
assert_eq!(
postcode_enum_filters
.iter()
.map(|filter| filter.feat_idx)
.collect::<Vec<_>>(),
vec![2]
);
}
#[test]
fn listing_floor_area_filter_keeps_unknown_values() {
let floor_area_filter = ParsedFilter {
@ -436,4 +289,30 @@ mod tests {
&keep_unknown_filter_idxs
));
}
#[test]
fn listing_poi_filter_uses_listing_metric_matrix() {
    // Same range filter on metric column 1 for both cases.
    let range_filter = || ParsedPoiFilter {
        metric_idx: 1,
        min_u16: 10,
        max_u16: 20,
    };

    // One listing, two metrics: the filtered metric is inside the range.
    let known_row = [NAN_U16, 15];
    assert!(row_passes_listing_poi_filters(
        0,
        &[range_filter()],
        &known_row,
        2
    ));

    // Same listing shape, but the filtered metric is unknown.
    let unknown_row = [NAN_U16, NAN_U16];
    assert!(!row_passes_listing_poi_filters(
        0,
        &[range_filter()],
        &unknown_row,
        2
    ));
}
}

View file

@ -18,14 +18,15 @@ use crate::data::{PostcodePoiMetrics, QuantRef};
use crate::features;
use crate::licensing::{check_license_bounds, resolve_share_code};
use crate::parsing::{
parse_field_indices_with_poi, parse_filters_with_poi, require_bounds, row_passes_filters,
row_passes_poi_filters,
parse_bounds, parse_field_indices_with_poi, parse_filters_with_poi, row_passes_filters,
row_passes_poi_filters, ParsedEnumFilter, ParsedFilter, ParsedPoiFilter,
};
use crate::routes::travel_time::{
load_travel_data, parse_optional_travel, row_passes_travel_filters,
};
use crate::routes::{fetch_screenshot_bytes, FeatureInfo};
use crate::state::SharedState;
use crate::utils::normalize_postcode;
const MAX_EXPORT_POSTCODES: usize = 250;
const EXPORT_SCREENSHOT_TIMEOUT_SECS: u64 = 12;
@ -46,6 +47,9 @@ pub struct ExportParams {
travel: Option<String>,
fields: Option<String>,
share: Option<String>,
/// Comma-separated list of postcodes for list-mode export. When supplied,
/// the bounds / filters / travel parameters are ignored.
postcodes: Option<String>,
}
/// Per-postcode accumulator for export aggregation (mean for numeric, mode for enum).
@ -193,6 +197,94 @@ fn collect_overlay_state_params(query: Option<&str>) -> Vec<String> {
collect_repeated_state_params(query, "overlay")
}
/// A parsed, deduplicated, validated list of postcodes to export.
///
/// Produced by `parse_postcode_list` from the raw `postcodes` query parameter.
struct ParsedPostcodeList {
/// Resolved (postcode index, normalized postcode) pairs, preserving input order.
/// Each postcode index appears at most once; repeated inputs are dropped.
entries: Vec<(usize, String)>,
/// Postcodes the user supplied that were not found in the dataset, in
/// normalized form. Not deduplicated.
unknown: Vec<String>,
}
/// Parses the raw `postcodes` export parameter into known and unknown postcodes.
///
/// `raw` is split on `,`, newline and `;`. Each token is trimmed and passed
/// through `normalize_postcode`; tokens that are empty before or after
/// normalization are ignored. Known postcodes are collected once each, in
/// first-seen order; postcodes missing from `state.postcode_data` are
/// collected in `unknown`.
///
/// # Errors
///
/// Returns a `400 Bad Request` response when
/// * more than `MAX_EXPORT_POSTCODES` distinct known postcodes are supplied, or
/// * no supplied postcode is known.
fn parse_postcode_list(
    raw: &str,
    state: &crate::state::AppState,
) -> Result<ParsedPostcodeList, axum::response::Response> {
    let mut entries: Vec<(usize, String)> = Vec::new();
    let mut unknown: Vec<String> = Vec::new();
    let mut seen: FxHashSet<usize> = FxHashSet::default();

    for raw_pc in raw.split([',', '\n', ';']) {
        let trimmed = raw_pc.trim();
        if trimmed.is_empty() {
            continue;
        }
        let normalized = normalize_postcode(trimmed);
        if normalized.is_empty() {
            continue;
        }

        let Some(&pc_idx) = state.postcode_data.postcode_to_idx.get(&normalized) else {
            unknown.push(normalized);
            continue;
        };
        if seen.contains(&pc_idx) {
            // Duplicate of an accepted postcode: skip silently.
            continue;
        }

        // Enforce the cap only when a new entry would actually be added, so
        // that trailing duplicates or unknown postcodes after a full list do
        // not turn a valid request into a "too many" error.
        if entries.len() >= MAX_EXPORT_POSTCODES {
            return Err((
                StatusCode::BAD_REQUEST,
                format!(
                    "Too many postcodes; at most {} are supported per export",
                    MAX_EXPORT_POSTCODES
                ),
            )
                .into_response());
        }
        seen.insert(pc_idx);
        entries.push((pc_idx, normalized));
    }

    if entries.is_empty() {
        return Err((
            StatusCode::BAD_REQUEST,
            "No valid postcodes supplied".to_string(),
        )
            .into_response());
    }

    Ok(ParsedPostcodeList { entries, unknown })
}
/// Tight bounding box around a set of postcode centroids (used for license checks).
///
/// `centroids` holds `(lat, lon)` pairs; `indices` selects the ones to cover.
/// Indices outside `centroids` are ignored, and so are centroids with a
/// non-finite latitude or longitude, so a half-valid centroid can never leak
/// `NaN` or `±INFINITY` into the result.
///
/// Returns `(south, west, north, east)`, or `(0.0, 0.0, 0.0, 0.0)` when no
/// usable centroid was selected.
fn bounds_for_postcode_indices(
    indices: &[usize],
    centroids: &[(f32, f32)],
) -> (f64, f64, f64, f64) {
    let mut bounds: Option<(f64, f64, f64, f64)> = None;

    for &(lat, lon) in indices.iter().filter_map(|&idx| centroids.get(idx)) {
        if !(lat.is_finite() && lon.is_finite()) {
            continue;
        }
        let (lat, lon) = (f64::from(lat), f64::from(lon));
        bounds = Some(match bounds {
            None => (lat, lon, lat, lon),
            Some((south, west, north, east)) => {
                (south.min(lat), west.min(lon), north.max(lat), east.max(lon))
            }
        });
    }

    bounds.unwrap_or((0.0, 0.0, 0.0, 0.0))
}
pub async fn get_export(
State(shared): State<Arc<SharedState>>,
headers: HeaderMap,
@ -201,16 +293,42 @@ pub async fn get_export(
Query(params): Query<ExportParams>,
) -> Result<impl IntoResponse, axum::response::Response> {
let state = shared.load_state();
let (south, west, north, east) =
require_bounds(params.bounds).map_err(IntoResponse::into_response)?;
let area_deg2 = (north - south).max(0.0) * (east - west).max(0.0);
if area_deg2 > MAX_EXPORT_BBOX_AREA_DEG2 {
return Err((
StatusCode::BAD_REQUEST,
"Export area is too large; zoom in further before exporting",
)
.into_response());
// Two modes: bounds-based (default) and explicit postcode list.
let postcode_list = match params.postcodes.as_deref() {
Some(raw) if !raw.trim().is_empty() => Some(parse_postcode_list(raw, &state)?),
_ => None,
};
let is_postcode_mode = postcode_list.is_some();
if let Some(list) = postcode_list.as_ref() {
if !list.unknown.is_empty() {
warn!(unknown = ?list.unknown, "Export: unknown postcodes ignored");
}
}
let (south, west, north, east) = if let Some(list) = postcode_list.as_ref() {
let idxs: Vec<usize> = list.entries.iter().map(|(i, _)| *i).collect();
bounds_for_postcode_indices(&idxs, &state.postcode_data.centroids)
} else {
let raw = params.bounds.clone().ok_or_else(|| {
(
StatusCode::BAD_REQUEST,
"bounds or postcodes parameter is required",
)
.into_response()
})?;
parse_bounds(&raw).map_err(IntoResponse::into_response)?
};
if !is_postcode_mode {
let area_deg2 = (north - south).max(0.0) * (east - west).max(0.0);
if area_deg2 > MAX_EXPORT_BBOX_AREA_DEG2 {
return Err((
StatusCode::BAD_REQUEST,
"Export area is too large; zoom in further before exporting",
)
.into_response());
}
}
let share_bounds = resolve_share_code(&state, params.share.as_deref()).await;
@ -218,24 +336,44 @@ pub async fn get_export(
let quant = state.data.quant_ref();
let poi_quant = state.data.poi_metrics.quant_ref();
let (parsed_filters, parsed_enum_filters, parsed_poi_filters) = parse_filters_with_poi(
params.filters.as_deref(),
&state.feature_name_to_index,
&state.data.enum_values,
&quant,
&state.data.poi_metrics.name_to_index,
&poi_quant,
)
.map_err(|err| (StatusCode::BAD_REQUEST, err).into_response())?;
let (parsed_filters, parsed_enum_filters, parsed_poi_filters): (
Vec<ParsedFilter>,
Vec<ParsedEnumFilter>,
Vec<ParsedPoiFilter>,
) = if is_postcode_mode {
(Vec::new(), Vec::new(), Vec::new())
} else {
parse_filters_with_poi(
params.filters.as_deref(),
&state.feature_name_to_index,
&state.data.enum_values,
&quant,
&state.data.poi_metrics.name_to_index,
&poi_quant,
)
.map_err(|err| (StatusCode::BAD_REQUEST, err).into_response())?
};
let has_poi_filters = !parsed_poi_filters.is_empty();
let filters_str = params.filters;
let travel_entries = parse_optional_travel(params.travel.as_deref())
.map_err(|err| (StatusCode::BAD_REQUEST, err).into_response())?;
let filters_str = if is_postcode_mode { None } else { params.filters };
let travel_entries = if is_postcode_mode {
Vec::new()
} else {
parse_optional_travel(params.travel.as_deref())
.map_err(|err| (StatusCode::BAD_REQUEST, err).into_response())?
};
let has_travel_filters = travel_entries
.iter()
.any(|entry| entry.filter_min.is_some() && entry.filter_max.is_some());
let travel_state_params = collect_travel_state_params(uri.query());
let overlay_state_params = collect_overlay_state_params(uri.query());
let travel_state_params = if is_postcode_mode {
Vec::new()
} else {
collect_travel_state_params(uri.query())
};
let overlay_state_params = if is_postcode_mode {
Vec::new()
} else {
collect_overlay_state_params(uri.query())
};
let fields_str = params.fields;
let share_code = params.share;
@ -260,29 +398,34 @@ pub async fn get_export(
share_code.as_deref(),
);
// Fetch screenshot (async, before spawn_blocking)
let auth_header = headers.get(header::AUTHORIZATION);
let screenshot_fetch = fetch_screenshot_bytes(&state, &frontend_params, auth_header);
let screenshot_bytes = match tokio::time::timeout(
Duration::from_secs(EXPORT_SCREENSHOT_TIMEOUT_SECS),
screenshot_fetch,
)
.await
{
Ok(Ok(bytes)) => {
info!(bytes = bytes.len(), "Fetched screenshot for export");
Some(bytes)
}
Ok(Err(err)) => {
warn!("Screenshot failed for export: {err}");
None
}
Err(_) => {
warn!(
timeout_secs = EXPORT_SCREENSHOT_TIMEOUT_SECS,
"Screenshot timed out for export"
);
None
// Screenshot only makes sense for the spatial / filter mode. In list mode the
// map view is unrelated to the selected postcodes, so we skip it.
let screenshot_bytes = if is_postcode_mode {
None
} else {
let auth_header = headers.get(header::AUTHORIZATION);
let screenshot_fetch = fetch_screenshot_bytes(&state, &frontend_params, auth_header);
match tokio::time::timeout(
Duration::from_secs(EXPORT_SCREENSHOT_TIMEOUT_SECS),
screenshot_fetch,
)
.await
{
Ok(Ok(bytes)) => {
info!(bytes = bytes.len(), "Fetched screenshot for export");
Some(bytes)
}
Ok(Err(err)) => {
warn!("Screenshot failed for export: {err}");
None
}
Err(_) => {
warn!(
timeout_secs = EXPORT_SCREENSHOT_TIMEOUT_SECS,
"Screenshot timed out for export"
);
None
}
}
};
@ -302,6 +445,9 @@ pub async fn get_export(
})
.collect();
let postcode_list_entries: Option<Vec<(usize, String)>> =
postcode_list.map(|list| list.entries);
let bytes = tokio::task::spawn_blocking(move || -> Result<Vec<u8>, String> {
let t0 = std::time::Instant::now();
let num_features = state.data.num_features;
@ -319,75 +465,102 @@ pub async fn get_export(
// Build set of enum feature indices for quick lookup
let enum_indices: FxHashMap<usize, ()> = enum_values.keys().map(|&idx| (idx, ())).collect();
// Aggregate directly by postcode so large requests don't retain every
// matching property row before sampling the exported postcodes.
let mut postcode_aggs: FxHashMap<usize, PostcodeExportAgg> = FxHashMap::default();
state
.grid
.for_each_in_bounds(south, west, north, east, |row_idx| {
let row = row_idx as usize;
if !row_passes_filters(
row,
&parsed_filters,
&parsed_enum_filters,
feature_data,
num_features,
) {
return;
}
if has_poi_filters && !row_passes_poi_filters(row, &parsed_poi_filters, poi_metrics)
{
return;
}
let postcode = pc_interner.resolve(&pc_keys[row]);
if has_travel_filters
&& !row_passes_travel_filters(postcode, &travel_entries, &travel_data)
{
return;
}
if let Some(&pc_idx) = postcode_data.postcode_to_idx.get(postcode) {
postcode_aggs
.entry(pc_idx)
.or_insert_with(|| PostcodeExportAgg::new(total_export_features))
.add_row(
let (postcode_aggs, was_sampled): (Vec<(usize, PostcodeExportAgg)>, bool) =
if let Some(entries) = postcode_list_entries.as_ref() {
// List mode: iterate property rows for each requested postcode and
// produce results in the order the user supplied them.
let mut out: Vec<(usize, PostcodeExportAgg)> = Vec::with_capacity(entries.len());
for (pc_idx, _normalized) in entries {
let mut agg = PostcodeExportAgg::new(total_export_features);
for &row_idx in state.data.rows_for_postcode(
&postcode_data.postcodes[*pc_idx],
) {
agg.add_row(
feature_data,
row,
row_idx as usize,
num_features,
&enum_indices,
&quant,
poi_metrics,
);
}
if agg.count > 0 {
out.push((*pc_idx, agg));
}
}
});
(out, false)
} else {
// Bounds mode: aggregate directly by postcode so large requests
// don't retain every matching property row before sampling.
let mut by_pc: FxHashMap<usize, PostcodeExportAgg> = FxHashMap::default();
state
.grid
.for_each_in_bounds(south, west, north, east, |row_idx| {
let row = row_idx as usize;
if !row_passes_filters(
row,
&parsed_filters,
&parsed_enum_filters,
feature_data,
num_features,
) {
return;
}
if has_poi_filters
&& !row_passes_poi_filters(row, &parsed_poi_filters, poi_metrics)
{
return;
}
let postcode = pc_interner.resolve(&pc_keys[row]);
if has_travel_filters
&& !row_passes_travel_filters(postcode, &travel_entries, &travel_data)
{
return;
}
if let Some(&pc_idx) = postcode_data.postcode_to_idx.get(postcode) {
by_pc.entry(pc_idx)
.or_insert_with(|| PostcodeExportAgg::new(total_export_features))
.add_row(
feature_data,
row,
num_features,
&enum_indices,
&quant,
poi_metrics,
);
}
});
let mut postcode_aggs: Vec<(usize, PostcodeExportAgg)> = postcode_aggs
.into_iter()
.filter(|(_, agg)| agg.count > 0)
.collect();
let mut aggs: Vec<(usize, PostcodeExportAgg)> = by_pc
.into_iter()
.filter(|(_, agg)| agg.count > 0)
.collect();
// Sort by property count descending
postcode_aggs.sort_unstable_by_key(|agg| std::cmp::Reverse(agg.1.count));
// Sort by property count descending
aggs.sort_unstable_by_key(|agg| std::cmp::Reverse(agg.1.count));
// Sample if too many postcodes
let was_sampled = postcode_aggs.len() > MAX_EXPORT_POSTCODES;
if was_sampled {
let mut hasher = DefaultHasher::new();
south.to_bits().hash(&mut hasher);
west.to_bits().hash(&mut hasher);
north.to_bits().hash(&mut hasher);
east.to_bits().hash(&mut hasher);
let seed = hasher.finish();
let was_sampled = aggs.len() > MAX_EXPORT_POSTCODES;
if was_sampled {
let mut hasher = DefaultHasher::new();
south.to_bits().hash(&mut hasher);
west.to_bits().hash(&mut hasher);
north.to_bits().hash(&mut hasher);
east.to_bits().hash(&mut hasher);
let seed = hasher.finish();
let len = postcode_aggs.len();
for pick in 0..MAX_EXPORT_POSTCODES {
let swap_idx = pick
+ ((seed.wrapping_mul(pick as u64 + 1).wrapping_add(pick as u64)) as usize
% (len - pick));
postcode_aggs.swap(pick, swap_idx);
}
postcode_aggs.truncate(MAX_EXPORT_POSTCODES);
postcode_aggs.sort_unstable_by_key(|agg| std::cmp::Reverse(agg.1.count));
}
let len = aggs.len();
for pick in 0..MAX_EXPORT_POSTCODES {
let swap_idx = pick
+ ((seed.wrapping_mul(pick as u64 + 1).wrapping_add(pick as u64))
as usize
% (len - pick));
aggs.swap(pick, swap_idx);
}
aggs.truncate(MAX_EXPORT_POSTCODES);
aggs.sort_unstable_by_key(|agg| std::cmp::Reverse(agg.1.count));
}
(aggs, was_sampled)
};
// Determine column order: filter features first, then remaining
let filter_feature_names = extract_filter_feature_names(filters_str.as_deref());
@ -545,12 +718,18 @@ pub async fn get_export(
frontend_params
);
// Sheet 1: "Selected" (filter features only) with link + screenshot
// Sheet 2: "All Data" (all features)
let sheet_configs: [(&str, &[usize], bool); 2] = [
("Selected", &filter_feature_indices, true),
("All Data", &all_feature_indices, false),
];
// Bounds mode: two sheets — "Selected" (filter features with link + screenshot)
// and "All Data" (all features).
// List mode: single sheet "Postcodes" with all data, no link or screenshot
// (the supplied list isn't tied to a map view).
let sheet_configs: Vec<(&str, &[usize], bool)> = if postcode_list_entries.is_some() {
vec![("Postcodes", &all_feature_indices, false)]
} else {
vec![
("Selected", &filter_feature_indices, true),
("All Data", &all_feature_indices, false),
]
};
for (sheet_name, feat_indices, include_header) in &sheet_configs {
let sheet = workbook.add_worksheet();