All good
Some checks failed
CI / Check (push) Has been cancelled
Build and publish Docker image / build-and-push (push) Has been cancelled

This commit is contained in:
Andras Schmelczer 2026-05-18 21:20:10 +01:00
parent 6ea544a0f6
commit 6cc7288126
45 changed files with 929 additions and 1043 deletions

View file

@ -6,6 +6,8 @@ use polars::prelude::*;
use serde::Serialize;
use tracing::info;
use crate::consts::{NAN_U16, QUANT_SCALE};
use crate::data::{PropertyData, QuantRef};
use crate::utils::{normalize_postcode, GridIndex, InternedColumn};
const GRID_CELL_SIZE: f32 = 0.01;
@ -52,15 +54,22 @@ pub struct ActualListingData {
pub listing_status: InternedColumn,
pub listing_date_iso: Vec<Option<String>>,
pub features: Vec<Vec<String>>,
/// Row-major feature matrix aligned with PropertyData::feature_names.
///
/// Rows start from a best-effort address/postcode join to the historical property
/// dataset, then live listing fields such as asking price and property type are
/// overlaid where available. This lets the listings endpoint use the same filter
/// execution path as the property endpoints.
pub filter_feature_data: Vec<u16>,
pub grid: GridIndex,
}
impl ActualListingData {
pub fn load(parquet_path: &Path) -> Result<Self> {
super::run_polars_io(|| Self::load_inner(parquet_path))
pub fn load(parquet_path: &Path, property_data: &PropertyData) -> Result<Self> {
super::run_polars_io(|| Self::load_inner(parquet_path, Some(property_data)))
}
fn load_inner(parquet_path: &Path) -> Result<Self> {
fn load_inner(parquet_path: &Path, property_data: Option<&PropertyData>) -> Result<Self> {
info!("Loading actual listings from {:?}", parquet_path);
let pl_path = PlRefPath::try_from_path(parquet_path)
.context("Failed to normalize actual listings parquet path")?;
@ -99,6 +108,18 @@ impl ActualListingData {
let price_qualifier = InternedColumn::build(&opt_to_string(&price_qualifier_raw));
let listing_status = InternedColumn::build(&opt_to_string(&listing_status_raw));
let filter_feature_data = build_filter_feature_data(
property_data,
&postcode,
&address,
&property_type_raw,
&leasehold_freehold_raw,
&rooms_total,
&floor_area_sqm,
&asking_price,
&asking_price_per_sqm,
);
let grid = GridIndex::build(&lat, &lon, GRID_CELL_SIZE);
info!(rows = row_count, "Actual listings loaded");
@ -122,6 +143,7 @@ impl ActualListingData {
listing_status,
listing_date_iso,
features,
filter_feature_data,
grid,
})
}
@ -150,6 +172,201 @@ impl ActualListingData {
}
}
/// Builds the row-major `u16` feature matrix that listing-level filters are evaluated on.
///
/// Returns an empty vector when `property_data` is `None`. Otherwise the result holds
/// `postcode.len() * property_data.num_features` cells, all initialised to `NAN_U16`, and is
/// filled in two passes:
///
/// 1. Join: a listing with a non-blank address is looked up via
///    `property_data.search_addresses`; when the first hit has exactly the same postcode,
///    that property's whole feature row is copied into the listing's row.
/// 2. Overlay: listing-native columns (floor area, rooms, asking price, price per sqm,
///    property type, tenure) are written over the matching feature columns.
///
/// Every column slice is read per listing row. `address` is indexed directly with the
/// row number taken from `postcode`, so it must be at least as long as `postcode`.
#[allow(clippy::too_many_arguments)]
fn build_filter_feature_data(
property_data: Option<&PropertyData>,
postcode: &[String],
address: &[Option<String>],
property_type: &[Option<String>],
leasehold_freehold: &[Option<String>],
rooms_total: &[Option<i32>],
floor_area_sqm: &[Option<f32>],
asking_price: &[Option<i64>],
asking_price_per_sqm: &[Option<f32>],
) -> Vec<u16> {
// Without a property dataset there is nothing to join against or to quantise with.
let Some(property_data) = property_data else {
return Vec::new();
};
let num_features = property_data.num_features;
// Every cell starts as "unknown"; rows that never join keep NAN_U16 outside the overlays.
let mut feature_data = vec![NAN_U16; postcode.len() * num_features];
let mut joined_rows = 0usize;
// Pass 1: best-effort join of each listing to one historical property row.
for (row, postcode_value) in postcode.iter().enumerate() {
// Listings with a missing or whitespace-only address are not joined.
let Some(address_value) = address[row]
.as_deref()
.map(str::trim)
.filter(|v| !v.is_empty())
else {
continue;
};
let query = format!("{address_value} {postcode_value}");
// Only the first result is used.
// NOTE(review): the `1` is presumably a result limit — confirm against `search_addresses`.
let Some(&property_row) = property_data.search_addresses(&query, 1).first() else {
continue;
};
// Reject a hit from another postcode; this is an exact string comparison.
if property_data.postcode(property_row) != postcode_value {
continue;
}
let dst = row * num_features;
let src = property_row * num_features;
feature_data[dst..dst + num_features]
.copy_from_slice(&property_data.feature_data[src..src + num_features]);
joined_rows += 1;
}
// Pass 2: overlay listing-native values on top of whatever the join produced.
let quant = property_data.quant_ref();
// clear_missing = false: a joined floor area survives when the listing has none.
overlay_numeric_feature(
&mut feature_data,
property_data,
&quant,
"Total floor area (sqm)",
floor_area_sqm.iter().copied(),
false,
);
// clear_missing = false: a joined room count survives when the listing has none.
overlay_numeric_feature(
&mut feature_data,
property_data,
&quant,
"Number of bedrooms & living rooms",
rooms_total.iter().map(|value| value.map(|v| v as f32)),
false,
);
// The asking price feeds both price features. clear_missing = true resets the cell to
// NAN_U16 when the listing has no asking price, discarding any joined historical value.
overlay_numeric_feature(
&mut feature_data,
property_data,
&quant,
"Estimated current price",
asking_price.iter().map(|value| value.map(|v| v as f32)),
true,
);
overlay_numeric_feature(
&mut feature_data,
property_data,
&quant,
"Last known price",
asking_price.iter().map(|value| value.map(|v| v as f32)),
true,
);
// Likewise the asking price per sqm feeds both per-sqm features and clears when missing.
overlay_numeric_feature(
&mut feature_data,
property_data,
&quant,
"Est. price per sqm",
asking_price_per_sqm.iter().copied(),
true,
);
overlay_numeric_feature(
&mut feature_data,
property_data,
&quant,
"Price per sqm",
asking_price_per_sqm.iter().copied(),
true,
);
// Enum overlays keep the joined value when the listing text is missing or unrecognised.
overlay_enum_feature(
&mut feature_data,
property_data,
"Property type",
property_type.iter().map(Option::as_deref),
false,
);
overlay_enum_feature(
&mut feature_data,
property_data,
"Leasehold/Freehold",
leasehold_freehold.iter().map(Option::as_deref),
false,
);
// Report how many listings found a matching property row.
info!(
rows = postcode.len(),
joined_rows, "Actual listings joined to property feature matrix"
);
feature_data
}
/// Returns the position of the first entry in `property_data.feature_names` equal to
/// `name`, or `None` when no feature carries exactly that name.
fn feature_index(property_data: &PropertyData, name: &str) -> Option<usize> {
    for (index, candidate) in property_data.feature_names.iter().enumerate() {
        if candidate == name {
            return Some(index);
        }
    }
    None
}
/// Writes one listing-native numeric column into the row-major feature matrix.
///
/// Does nothing when `name` is not a known feature or when its index is not below
/// `property_data.num_numeric`. For each row, a `Some` value is quantised with
/// `encode_numeric_value` and stored; a `None` value resets the cell to `NAN_U16` only
/// when `clear_missing` is set and otherwise leaves the existing cell untouched.
///
/// `values` yields one item per row, in row order.
fn overlay_numeric_feature<I>(
    feature_data: &mut [u16],
    property_data: &PropertyData,
    quant: &QuantRef<'_>,
    name: &str,
    values: I,
    clear_missing: bool,
) where
    I: IntoIterator<Item = Option<f32>>,
{
    let column = match feature_index(property_data, name) {
        Some(column) if column < property_data.num_numeric => column,
        _ => return,
    };
    let stride = property_data.num_features;

    for (row, value) in values.into_iter().enumerate() {
        let cell = row * stride + column;
        if let Some(number) = value {
            feature_data[cell] = encode_numeric_value(quant, column, number);
        } else if clear_missing {
            feature_data[cell] = NAN_U16;
        }
    }
}
/// Writes one listing-native enum column into the row-major feature matrix.
///
/// Does nothing when `name` is not a known feature or has no entry in
/// `property_data.enum_values`. Each value is trimmed and matched exactly against the
/// feature's enum labels; the position of the matching label is stored as the cell value.
/// A missing, blank or unrecognised value resets the cell to `NAN_U16` only when
/// `clear_missing` is set and otherwise leaves the existing cell untouched.
///
/// `values` yields one item per row, in row order.
fn overlay_enum_feature<'a, I>(
    feature_data: &mut [u16],
    property_data: &PropertyData,
    name: &str,
    values: I,
    clear_missing: bool,
) where
    I: IntoIterator<Item = Option<&'a str>>,
{
    let Some(column) = feature_index(property_data, name) else {
        return;
    };
    let Some(labels) = property_data.enum_values.get(&column) else {
        return;
    };
    let stride = property_data.num_features;

    for (row, value) in values.into_iter().enumerate() {
        let cell = row * stride + column;
        let matched = match value.map(str::trim) {
            Some(text) if !text.is_empty() => {
                labels.iter().position(|candidate| candidate == text)
            }
            _ => None,
        };
        if let Some(position) = matched {
            feature_data[cell] = position as u16;
        } else if clear_missing {
            feature_data[cell] = NAN_U16;
        }
    }
}
/// Quantises `value` for feature `feat_idx` into the `u16` cell encoding.
///
/// Non-finite input maps to `NAN_U16`. A feature whose quantisation range is not positive
/// maps to `0`. Otherwise the value is rescaled by the feature's minimum and range,
/// multiplied by `QUANT_SCALE`, rounded, and clamped to `0.0..=QUANT_SCALE`.
fn encode_numeric_value(quant: &QuantRef<'_>, feat_idx: usize, value: f32) -> u16 {
    if value.is_nan() || value.is_infinite() {
        return NAN_U16;
    }

    let span = quant.quant_range[feat_idx];
    if span <= 0.0 {
        return 0;
    }

    let offset = value - quant.quant_min[feat_idx];
    let scaled = (offset / span * QUANT_SCALE).round();
    scaled.clamp(0.0, QUANT_SCALE) as u16
}
fn opt_to_string(values: &[Option<String>]) -> Vec<String> {
values
.iter()
@ -311,7 +528,7 @@ mod tests {
return;
};
let data = ActualListingData::load(&path).expect("listings load");
let data = ActualListingData::load_inner(&path, None).expect("listings load");
assert!(!data.lat.is_empty());
assert_eq!(data.lat.len(), data.lon.len());
assert_eq!(data.lat.len(), data.postcode.len());

View file

@ -30,16 +30,6 @@ const GROCERY_DASHBOARD_CATEGORIES: &[&str] = &[
"Budgens",
"Centra",
"Co-op",
"Central England Co-operative",
"Chelmsford Star Co-operative Society",
"East of England Co-operative",
"Heart of England Co-operative",
"Lincolnshire Co-operative",
"Midcounties Co-operative",
"Scottish Midland Co-operative",
"Tamworth Co-operative Society",
"The Radstock Co-operative Society",
"The Southern Co-operative",
"COOK",
"Costco",
"Dunnes Stores",
@ -104,10 +94,35 @@ fn add_category_filter_index(
}
}
/// Maps the known co-operative society and Co-op brand variants onto the single `"Co-op"`
/// category; every other category is returned unchanged.
///
/// Matching is exact: it is case-sensitive and does not trim whitespace.
fn canonical_poi_category(category: &str) -> &str {
    const CO_OP_ALIASES: &[&str] = &[
        "Allendale Co-operative Society",
        "Central England Co-operative",
        "Channel Islands Co-operative Society",
        "Chelmsford Star Co-operative Society",
        "Clydebank Co-operative",
        "Coniston Co-operative Society",
        "Co-op Food",
        "East of England Co-operative",
        "Heart of England Co-operative",
        "Langdale Co-operative Society",
        "Lincolnshire Co-operative",
        "Midcounties Co-operative",
        "Scottish Midland Co-operative",
        "Tamworth Co-operative Society",
        "The Co-operative Food",
        "The Co-operative Food PFS",
        "The Co-operative Group",
        "The Radstock Co-operative Society",
        "The Southern Co-operative",
    ];

    if CO_OP_ALIASES.contains(&category) {
        "Co-op"
    } else {
        category
    }
}
pub fn resolve_poi_category_filter(category_values: &[String], categories: &str) -> FxHashSet<u16> {
let mut selected = FxHashSet::default();
for part in categories.split(',') {
let category = part.trim();
let category = canonical_poi_category(part.trim());
if category.is_empty() {
continue;
}
@ -200,12 +215,18 @@ impl POIData {
let id_raw: Vec<String> = extract_str_col(&df, "id")?;
let name = extract_str_col(&df, "name")?;
let category_raw = extract_str_col(&df, "category")?;
let category_raw: Vec<String> = extract_str_col(&df, "category")?
.into_iter()
.map(|category| canonical_poi_category(&category).to_string())
.collect();
let group_raw = extract_str_col(&df, "group")?;
let lat = extract_f32_col(&df, "lat")?;
let lng = extract_f32_col(&df, "lng")?;
let emoji_raw = extract_str_col(&df, "emoji")?;
let icon_category_raw = extract_str_col(&df, "icon_category")?;
let icon_category_raw: Vec<String> = extract_str_col(&df, "icon_category")?
.into_iter()
.map(|category| canonical_poi_category(&category).to_string())
.collect();
// Pack POI IDs into a contiguous buffer
let total_id_bytes: usize = id_raw.iter().map(|s| s.len()).sum();
@ -351,4 +372,19 @@ mod tests {
assert!(selected.is_empty());
}
/// Two different co-operative aliases in one filter string must resolve to the single
/// `"Co-op"` category index, and canonicalisation must leave other brands alone.
#[test]
fn coop_category_aliases_resolve_to_single_category() {
    let category_values: Vec<String> = ["Co-op", "Tesco"]
        .iter()
        .map(|name| name.to_string())
        .collect();
    let requested = ["Central England Co-operative", "The Southern Co-operative"].join(",");

    let resolved = resolve_poi_category_filter(&category_values, &requested);

    assert!(resolved.contains(&0));
    assert_eq!(resolved.len(), 1);

    let canonical_lincolnshire = canonical_poi_category("Lincolnshire Co-operative");
    assert_eq!(canonical_lincolnshire, "Co-op");
    let canonical_tesco = canonical_poi_category("Tesco");
    assert_eq!(canonical_tesco, "Tesco");
}
}

View file

@ -1014,22 +1014,6 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
},
];
/// Names of the features in the "Properties" and "Property prices" groups.
///
/// These describe an individual property (price, size, type, etc.) rather than the
/// surrounding area, so callers can use the list to skip filters that should not exclude
/// live listings on the map even though they hide aggregated property rows.
/// Names are returned in group order, then in feature order within each group.
pub fn property_level_feature_names() -> Vec<&'static str> {
    const PROPERTY_GROUPS: &[&str] = &["Properties", "Property prices"];

    let mut names = Vec::new();
    for group in FEATURE_GROUPS.iter() {
        if !PROPERTY_GROUPS.contains(&group.name) {
            continue;
        }
        for feature in group.features.iter() {
            let name = match feature {
                Feature::Numeric(c) => c.name,
                Feature::Enum(c) => c.name,
            };
            names.push(name);
        }
    }
    names
}
/// Flat ordered list of all numeric feature names (follows group order).
pub fn all_numeric_feature_names() -> Vec<&'static str> {
FEATURE_GROUPS

View file

@ -541,7 +541,7 @@ async fn main() -> anyhow::Result<()> {
bail!("Actual listings parquet not found: {}", path.display());
}
info!("Loading actual listings from {}", path.display());
let listings = data::ActualListingData::load(path)?;
let listings = data::ActualListingData::load(path, &property_data)?;
trim_allocator("actual listings load");
info!(rows = listings.lat.len(), "Actual listings loaded");
Some(Arc::new(listings))

View file

@ -1,16 +1,20 @@
use std::sync::Arc;
use axum::extract::{Query, State};
use axum::response::Json;
use axum::response::{IntoResponse, Json, Response};
use axum::Extension;
use rustc_hash::FxHashSet;
use serde::{Deserialize, Serialize};
use tracing::info;
use crate::api_error::ApiError;
use crate::auth::OptionalUser;
use crate::consts::NAN_U16;
use crate::data::ActualListing;
use crate::features::property_level_feature_names;
use crate::licensing::{check_license_bounds, resolve_share_code};
use crate::parsing::{
parse_filters_with_poi, require_bounds, row_passes_filters, row_passes_poi_filters,
ParsedEnumFilter, ParsedFilter,
};
use crate::state::{AppState, SharedState};
@ -25,6 +29,8 @@ pub struct ActualListingsParams {
travel: Option<String>,
/// Number of results to skip. Defaults to 0.
offset: Option<usize>,
/// Share-link code; grants bbox-scoped access for unlicensed users.
share: Option<String>,
}
#[derive(Serialize)]
@ -35,10 +41,24 @@ pub struct ActualListingsResponse {
pub truncated: bool,
}
/// Feature names whose filters are evaluated per listing row against the listing's own
/// feature matrix; filters on any other feature go down the postcode-level path.
const LISTING_LEVEL_FILTER_FEATURES: &[&str] = &[
"Property type",
"Leasehold/Freehold",
"Total floor area (sqm)",
"Number of bedrooms & living rooms",
"Estimated current price",
"Last known price",
"Est. price per sqm",
"Price per sqm",
];
/// Listing-level numeric features for which an unknown (`NAN_U16`) value still passes the
/// filter instead of excluding the listing.
const KEEP_UNKNOWN_LISTING_FILTER_FEATURES: &[&str] = &["Total floor area (sqm)"];
pub async fn get_actual_listings(
State(shared): State<Arc<SharedState>>,
Extension(user): Extension<OptionalUser>,
Query(params): Query<ActualListingsParams>,
) -> Result<Json<ActualListingsResponse>, ApiError> {
) -> Result<Json<ActualListingsResponse>, Response> {
let state = shared.load_state();
let offset = params.offset.unwrap_or(0);
let Some(actual_listings) = state.actual_listings.clone() else {
@ -49,11 +69,15 @@ pub async fn get_actual_listings(
truncated: false,
}));
};
let (south, west, north, east) = require_bounds(params.bounds).map_err(ApiError::from)?;
let (south, west, north, east) =
require_bounds(params.bounds).map_err(IntoResponse::into_response)?;
let share_bounds = resolve_share_code(&state, params.share.as_deref()).await;
check_license_bounds(&user.0, (south, west, north, east), share_bounds)?;
let quant = state.data.quant_ref();
let poi_quant = state.data.poi_metrics.quant_ref();
let (mut parsed_filters, mut parsed_enum_filters, parsed_poi_filters) = parse_filters_with_poi(
let (parsed_filters, parsed_enum_filters, parsed_poi_filters) = parse_filters_with_poi(
params.filters.as_deref(),
&state.feature_name_to_index,
&state.data.enum_values,
@ -61,40 +85,38 @@ pub async fn get_actual_listings(
&state.data.poi_metrics.name_to_index,
&poi_quant,
)
.map_err(ApiError::BadRequest)?;
.map_err(|err| ApiError::BadRequest(err).into_response())?;
// Drop property-level filters (price, sqm, build year, beds, type, etc.) so they
// don't hide live listings — those are individual-property concerns the user can
// judge from the pin itself. We only keep area/postcode-level filters here.
let property_level_idxs: FxHashSet<usize> = property_level_feature_names()
.into_iter()
.filter_map(|name| state.feature_name_to_index.get(name).copied())
.collect();
parsed_filters.retain(|f| !property_level_idxs.contains(&f.feat_idx));
parsed_enum_filters.retain(|f| !property_level_idxs.contains(&f.feat_idx));
let travel_entries = parse_optional_travel(params.travel.as_deref())
.map_err(|err| ApiError::BadRequest(err).into_response())?;
let travel_entries =
parse_optional_travel(params.travel.as_deref()).map_err(ApiError::BadRequest)?;
let listing_level_feature_idxs = listing_level_filter_feature_idxs(&state);
let keep_unknown_listing_filter_idxs = keep_unknown_listing_filter_feature_idxs(&state);
let (listing_filters, postcode_filters) =
split_numeric_filters(parsed_filters, &listing_level_feature_idxs);
let (listing_enum_filters, postcode_enum_filters) =
split_enum_filters(parsed_enum_filters, &listing_level_feature_idxs);
let has_area_filters = !parsed_filters.is_empty()
|| !parsed_enum_filters.is_empty()
let has_postcode_filters = !postcode_filters.is_empty()
|| !postcode_enum_filters.is_empty()
|| !parsed_poi_filters.is_empty()
|| !travel_entries.is_empty();
let has_listing_filters = !listing_filters.is_empty() || !listing_enum_filters.is_empty();
let state_clone = state.clone();
let response =
tokio::task::spawn_blocking(move || -> Result<ActualListingsResponse, String> {
let t0 = std::time::Instant::now();
let passing_postcodes = if has_area_filters {
let passing_postcodes = if has_postcode_filters {
Some(compute_passing_postcodes(
&state_clone,
south,
west,
north,
east,
&parsed_filters,
&parsed_enum_filters,
&postcode_filters,
&postcode_enum_filters,
&parsed_poi_filters,
&travel_entries,
)?)
@ -116,6 +138,18 @@ pub async fn get_actual_listings(
return None;
}
}
if has_listing_filters
&& !row_passes_listing_filters(
row,
&listing_filters,
&listing_enum_filters,
&actual_listings.filter_feature_data,
state_clone.data.num_features,
&keep_unknown_listing_filter_idxs,
)
{
return None;
}
Some(row)
})
.collect();
@ -142,7 +176,8 @@ pub async fn get_actual_listings(
total = total_matching,
total_in_bounds,
offset,
filtered = passing_postcodes.is_some(),
postcode_filtered = passing_postcodes.is_some(),
listing_filtered = has_listing_filters,
ms = format_args!("{:.1}", elapsed.as_secs_f64() * 1000.0),
"GET /api/actual-listings"
);
@ -155,12 +190,82 @@ pub async fn get_actual_listings(
})
})
.await
.map_err(|error| ApiError::Internal(error.to_string()))?
.map_err(ApiError::Internal)?;
.map_err(|error| ApiError::Internal(error.to_string()).into_response())?
.map_err(|err| ApiError::Internal(err).into_response())?;
Ok(Json(response))
}
/// Feature indices of every `LISTING_LEVEL_FILTER_FEATURES` name known to `state`.
fn listing_level_filter_feature_idxs(state: &AppState) -> FxHashSet<usize> {
    let names: &[&str] = LISTING_LEVEL_FILTER_FEATURES;
    feature_idxs(state, names)
}
/// Feature indices of every `KEEP_UNKNOWN_LISTING_FILTER_FEATURES` name known to `state`.
fn keep_unknown_listing_filter_feature_idxs(state: &AppState) -> FxHashSet<usize> {
    let names: &[&str] = KEEP_UNKNOWN_LISTING_FILTER_FEATURES;
    feature_idxs(state, names)
}
fn feature_idxs(state: &AppState, names: &[&str]) -> FxHashSet<usize> {
names
.iter()
.filter_map(|name| state.feature_name_to_index.get(*name).copied())
.collect()
}
/// Splits numeric filters into `(listing_filters, postcode_filters)`.
///
/// A filter is listing-level when its `feat_idx` is in `listing_level_feature_idxs`; all
/// others are postcode-level. Relative order is preserved within each half.
fn split_numeric_filters(
    filters: Vec<ParsedFilter>,
    listing_level_feature_idxs: &FxHashSet<usize>,
) -> (Vec<ParsedFilter>, Vec<ParsedFilter>) {
    filters
        .into_iter()
        .partition(|filter| listing_level_feature_idxs.contains(&filter.feat_idx))
}
/// Splits enum filters into `(listing_filters, postcode_filters)`.
///
/// A filter is listing-level when its `feat_idx` is in `listing_level_feature_idxs`; all
/// others are postcode-level. Relative order is preserved within each half.
fn split_enum_filters(
    filters: Vec<ParsedEnumFilter>,
    listing_level_feature_idxs: &FxHashSet<usize>,
) -> (Vec<ParsedEnumFilter>, Vec<ParsedEnumFilter>) {
    filters
        .into_iter()
        .partition(|filter| listing_level_feature_idxs.contains(&filter.feat_idx))
}
/// Returns whether listing `row` satisfies every listing-level filter.
///
/// `feature_data` is the row-major listing feature matrix with `num_features` cells per
/// row.
///
/// * Numeric filters pass when the cell lies in `min_u16..=max_u16`. An unknown cell
///   (`NAN_U16`) passes only when the filter's feature is in `keep_unknown_filter_idxs`.
/// * Enum filters pass when the cell is known and contained in the filter's `allowed` set.
///
/// A cell that does not exist is treated as unknown instead of panicking. This covers an
/// empty matrix (listings loaded without property data), a matrix shorter than
/// `(row + 1) * num_features`, and a `feat_idx` outside the row.
fn row_passes_listing_filters(
    row: usize,
    filters: &[ParsedFilter],
    enum_filters: &[ParsedEnumFilter],
    feature_data: &[u16],
    num_features: usize,
    keep_unknown_filter_idxs: &FxHashSet<usize>,
) -> bool {
    // Checked lookup: a column outside the row must not alias a cell of a later row, and a
    // missing cell must not abort the request.
    let cell = |feat_idx: usize| -> u16 {
        if feat_idx >= num_features {
            return NAN_U16;
        }
        row.checked_mul(num_features)
            .and_then(|base| base.checked_add(feat_idx))
            .and_then(|index| feature_data.get(index))
            .copied()
            .unwrap_or(NAN_U16)
    };

    let numeric_ok = filters.iter().all(|filter| {
        let raw = cell(filter.feat_idx);
        if raw == NAN_U16 {
            keep_unknown_filter_idxs.contains(&filter.feat_idx)
        } else {
            raw >= filter.min_u16 && raw <= filter.max_u16
        }
    });

    // `&&` short-circuits, so enum filters are only consulted once the numeric ones pass.
    numeric_ok
        && enum_filters.iter().all(|filter| {
            let raw = cell(filter.feat_idx);
            raw != NAN_U16 && filter.allowed.contains(&raw)
        })
}
#[allow(clippy::too_many_arguments)]
fn compute_passing_postcodes(
state: &AppState,
@ -224,3 +329,111 @@ fn compute_passing_postcodes(
Ok(passing)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Numeric filter on `feat_idx` accepting quantised values `0..=100`.
    fn numeric_filter(feat_idx: usize) -> ParsedFilter {
        ParsedFilter {
            feat_idx,
            min_u16: 0,
            max_u16: 100,
        }
    }

    /// Enum filter on `feat_idx` that allows only enum code `0`.
    fn enum_filter(feat_idx: usize) -> ParsedEnumFilter {
        ParsedEnumFilter {
            feat_idx,
            allowed: std::iter::once(0u16).collect(),
        }
    }

    /// Feature indices of the given numeric filters, in order.
    fn numeric_idxs(filters: &[ParsedFilter]) -> Vec<usize> {
        filters.iter().map(|filter| filter.feat_idx).collect()
    }

    /// Feature indices of the given enum filters, in order.
    fn enum_idxs(filters: &[ParsedEnumFilter]) -> Vec<usize> {
        filters.iter().map(|filter| filter.feat_idx).collect()
    }

    #[test]
    fn splits_actual_listing_filters_by_listing_native_features() {
        let listing_level_feature_idxs: FxHashSet<usize> = [1usize, 3].into_iter().collect();

        let numeric: Vec<ParsedFilter> = [0usize, 1, 3].into_iter().map(numeric_filter).collect();
        let (listing_filters, postcode_filters) =
            split_numeric_filters(numeric, &listing_level_feature_idxs);
        assert_eq!(numeric_idxs(&listing_filters), vec![1, 3]);
        assert_eq!(numeric_idxs(&postcode_filters), vec![0]);

        let enums: Vec<ParsedEnumFilter> = [2usize, 3].into_iter().map(enum_filter).collect();
        let (listing_enum_filters, postcode_enum_filters) =
            split_enum_filters(enums, &listing_level_feature_idxs);
        assert_eq!(enum_idxs(&listing_enum_filters), vec![3]);
        assert_eq!(enum_idxs(&postcode_enum_filters), vec![2]);
    }

    #[test]
    fn listing_floor_area_filter_keeps_unknown_values() {
        let keep_unknown_filter_idxs: FxHashSet<usize> = [0usize].into_iter().collect();

        // Evaluates a single-cell matrix against a floor-area filter of 10..=20.
        let passes = |raw: u16| {
            let floor_area_filter = ParsedFilter {
                feat_idx: 0,
                min_u16: 10,
                max_u16: 20,
            };
            row_passes_listing_filters(
                0,
                &[floor_area_filter],
                &[],
                &[raw],
                1,
                &keep_unknown_filter_idxs,
            )
        };

        // Unknown floor area is kept, out-of-range is rejected, in-range is accepted.
        assert!(passes(NAN_U16));
        assert!(!passes(9));
        assert!(passes(15));
    }
}